protk 1.4.4.beta2 → 1.4.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f278e7fe8e3a0955907a13952ced4fa0d772c204
4
- data.tar.gz: 513fb214683486b9d3596a9b9f0956107d611170
3
+ metadata.gz: 1b59cd3751adc7a13c6dce81a90c2a4d739e1efe
4
+ data.tar.gz: 3ac1aba71a95b729101a6c64699cebaf1929fe34
5
5
  SHA512:
6
- metadata.gz: e59fcb0724cbf42b8f63e65ca6d88fd91c7d45d3981964358c82e040f9235e2ce6768251d088966044827cadf5c9ce46f189345d3c27293d0b10b44daf018c10
7
- data.tar.gz: 36329dca4cf416fc2b9bd6f0d395895ee2fd32fdba2f6ebd75c5d26ed21b497dc890cfc6bcf36b1f08bc7f3954bae76adea64450a431d6ac365a2930191997cc
6
+ metadata.gz: 6ab966131e53b6e379ba1df7717cc71e0a7ead11b603d16d474f4fdce487623546ca472c8624421d62af1e238f7e2917f8ed1e66c52a82d0fc4798cd7b1fdf6b
7
+ data.tar.gz: 05b98a731eb99063f17942fed42dce0e450488e61b11452b918269bd27788a300d7d2ebf4c2a67930c14bc5291dfab7bf8b13b992300356e96b0d274e68c6d96
data/README.md CHANGED
@@ -16,10 +16,16 @@ Protk is a suite of tools for proteomics. It aims to present a simple and consis
16
16
 
17
17
  Protk is a ruby gem and requires ruby 2.0 or higher with support for libxml2. To avoid installation problems we recommend using [rvm](https://rvm.io) to install ruby.
18
18
 
19
- ``` shell
19
+ ```shell
20
20
  gem install protk
21
21
  ```
22
22
 
23
+ On macOS you may also need to install coreutils with Homebrew:
24
+
25
+ ```bash
26
+ brew install coreutils
27
+ ```
28
+
23
29
  ## Ruby Compatibility
24
30
 
25
31
  In general Protk requires ruby with a version >=2.0.
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Created by Ira Cooke 4/9/2013
5
+ #
6
+ #
7
+
8
+ require 'protk/constants'
9
+ require 'protk/tool'
10
+ require 'protk/gff_to_proteindb_tool'
11
+ require 'bio'
12
+
13
+ tool=GffToProteinDBTool.new([:explicit_output,:debug,:add_transcript_info])
14
+ tool.option_parser.banner = "Create a protein database from Maker gene prediction \
15
+ output that is suitable for later processing by proteogenomics tools.\
16
+ \n\nUsage: maker_to_proteindb.rb [options] maker.gff3"
17
+
18
+ tool.add_value_option(:proteins_file,nil,['-p', '--prot-fasta proteins', 'A fasta file \
19
+ containing protein sequences for each transcript'])
20
+
21
+ # tool.add_value_option(:explicit_output,nil,['-o', '--output out', 'An explicitly named output file. \
22
+ # The default is to write to standard output'])
23
+
24
+ exit unless tool.check_options(true)
25
+
26
+ inname=ARGV.shift
27
+
28
+ $protk = Constants.instance
29
+ log_level = tool.debug ? :debug : :fatal
30
+ $protk.info_level= log_level
31
+
32
+ tool.print_progress=true
33
+
34
+ outfile=nil
35
+ if ( tool.explicit_output != nil)
36
+ outfile=File.open(tool.explicit_output,'w')
37
+ else
38
+ outfile=$stdout
39
+ tool.print_progress=false
40
+ end
41
+
42
+ gene_lines=[]
43
+
44
+ def get_protein_sequence(transcript_id,proteins_file)
45
+ %x[samtools faidx #{proteins_file} #{transcript_id} | tail -n +2]
46
+ end
47
+
48
+ def cds_to_header_text(coding_sequence,transcript_id)
49
+ # require 'debugger';debugger
50
+ imatch=coding_sequence.match(/CDS\t(\d+)\t(\d+).*?([-\+]{1}.*?Parent=#{transcript_id})$/)
51
+ if imatch==nil
52
+ return ""
53
+ end
54
+ istart=imatch[1]
55
+ iend=imatch[2]
56
+ "#{istart}|#{iend}"
57
+ end
58
+
59
+ def sequence_fasta_header(tool,transcript_line,coding_sequences)
60
+
61
+ tmatch=transcript_line.match(/mRNA\t(\d+)\t(\d+).*?([-\+]{1}).*?ID=(.*?);/)
62
+ # require 'debugger'; debugger
63
+ tstart,tend,tstrand = transcript_line.match(/mRNA\t(\d+)\t(\d+).*?([-\+]{1})/).captures
64
+
65
+ # tstart=tmatch[1]
66
+ # tend=tmatch[2]
67
+ # tsidfield = transcript_line.split("\t")[8]
68
+
69
+ tid = transcript_line.match(/ID=([^;]+)/).captures[0]
70
+ # if tsidfield =~ /ID=/
71
+ # tid = tsidfield.match(/ID=(.*?);/).captures[0]
72
+ # else
73
+ # tid = tsidfield.gsub(" ","_").gsub(";","_")
74
+ # end
75
+
76
+ # require 'byebug';byebug
77
+
78
+ tstrandfr="fwd"
79
+ tstrandfr = "rev" if tstrand=="-"
80
+
81
+ scaffold=transcript_line.split("\t")[0]
82
+
83
+ # tid=tmatch[4]
84
+ header=">lcl|#{scaffold}_#{tstrandfr}_#{tid} #{tstart}|#{tend}"
85
+ if tool.add_transcript_info
86
+ coding_sequences.each { |coding_sequence| header << " #{cds_to_header_text(coding_sequence,tid)}" }
87
+ end
88
+ header
89
+ end
90
+
91
+ def protein_sequence(protein_lines)
92
+ seq=""
93
+ protein_lines.each_with_index do |line, i|
94
+ seq << line.match(/(\w+)\]?$/)[1]
95
+ end
96
+
97
+ seq
98
+ end
99
+
100
+
101
+ def parse_gene(tool,gene_lines)
102
+
103
+ # require 'byebug';byebug
104
+ geneid=gene_lines[0].match(/ID=([^;]+)/).captures[0]
105
+
106
+ scaffold_id = gene_lines[1].split("\t")[0]
107
+
108
+ transcripts=tool.get_lines_matching(/mRNA/,gene_lines)
109
+
110
+ coding_sequences=tool.get_lines_matching(/CDS/,gene_lines)
111
+
112
+ fasta_string=""
113
+
114
+ transcripts.each_with_index do |ts, i|
115
+
116
+ prot_id=ts.match(/ID=([^;]+)/).captures[0]
117
+
118
+ begin
119
+ fh=sequence_fasta_header(tool,ts,coding_sequences)
120
+ fasta_string << "#{fh}\n"
121
+ ps=get_protein_sequence(prot_id,tool.proteins_file)
122
+ fasta_string << "#{ps}"
123
+ rescue => e
124
+ $protk.log "Unable to retrieve protein for #{prot_id} #{e}" , :debug
125
+ end
126
+ end
127
+
128
+ fasta_string
129
+ end
130
+
131
+
132
+
133
+ File.open(inname).each_with_index do |line, line_i|
134
+ line.chomp!
135
+
136
+ if tool.start_new_gene(line)
137
+ if gene_lines.length > 0
138
+ gene_string=parse_gene(tool,gene_lines)
139
+ outfile.write gene_string
140
+ gene_lines=[]
141
+ end
142
+ end
143
+
144
+ if line =~ /maker/
145
+ gene_lines << line
146
+ end
147
+
148
+ end
@@ -93,11 +93,13 @@ if for_galaxy || Pathname.new(database_path).extname.to_s.downcase != ".fasta"
93
93
  # database_path="#{database_path}.fasta"
94
94
  end
95
95
 
96
+ db_noext = "#{Pathname.new(database_path).sub_ext('')}"
97
+
96
98
  # Database must be indexed
97
- unless FileTest.exists?("#{database_path}.canno")
98
- # dbdir = Pathname.new(database_path).dirname.to_s
99
+ unless FileTest.exists?("#{db_noext}.canno")
99
100
  tdavalue=search_tool.decoy_search ? 1 : 0;
100
- make_msgfdb_cmd << "java -Xmx3500M -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
101
+ genv.log "Database index not found at #{db_noext}. Building new index" , :info
102
+ make_msgfdb_cmd << "java -Xmx#{search_tool.java_mem} -cp #{genv.msgfplusjar} edu.ucsd.msjava.msdbsearch.BuildSA -d #{database_path} -tda #{tdavalue}; "
101
103
  end
102
104
 
103
105
 
@@ -262,4 +264,4 @@ ARGV.each do |filename|
262
264
  #
263
265
  make_msgfdb_cmd=""
264
266
 
265
- end
267
+ end
@@ -54,6 +54,19 @@ else
54
54
  output_file=Tool.default_output_path(inputs,".prot.xml",prophet_tool.output_prefix,@output_suffix)
55
55
  end
56
56
 
57
+ genv.log("Checking input files ...",:info)
58
+ inputs.each {|file_name|
59
+
60
+ throw "Missing input file #{file_name}" unless File.exist?(file_name)
61
+
62
+ file_pepxml = PepXML.new(file_name)
63
+
64
+ db_path=file_pepxml.extract_db()
65
+ throw "Unable to find database #{db_path} used for searching. Fix paths in input files first" unless File.exist?(db_path)
66
+ }
67
+
68
+
69
+
57
70
  if ( !Pathname.new(output_file).exist? || prophet_tool.over_write )
58
71
 
59
72
  cmd="ProteinProphet "
@@ -38,7 +38,7 @@ end
38
38
 
39
39
  def protein_id_to_gffid(protein_id,gff_idregex)
40
40
  return protein_id if gff_idregex.nil?
41
-
41
+ # require 'byebug'; byebug
42
42
  m = protein_id.match(/#{gff_idregex}/)
43
43
  if m
44
44
  return m.captures[0]
@@ -77,7 +77,9 @@ def prepare_fasta(database_path,type)
77
77
  end
78
78
 
79
79
 
80
- db_indexfilename = type=='prot' ? "#{db_filename}.pin" : "#{db_filename}.nhr"
80
+ db_indexfilename = type=='prot' ? "#{db_filename}.00.pin" : "#{db_filename}.nhr"
81
+
82
+ # require 'byebug';byebug
81
83
 
82
84
  if File.exist?(db_indexfilename)
83
85
  orf_lookup = FastaDB.new(db_filename)
@@ -101,6 +103,7 @@ tool.add_value_option(:protein_probability_threshold,0.99,['--prot-threshold pro
101
103
  tool.add_value_option(:gff_idregex,nil,['--gff-idregex pre','Regex with capture group for parsing gff ids from protein ids'])
102
104
  tool.add_value_option(:genome_idregex,nil,['--genome-idregex pre','Regex with capture group for parsing genomic ids from protein ids'])
103
105
  tool.add_value_option(:ignore_regex,nil,['--ignore-regex pre','Regex to match protein ids that we should ignore completely'])
106
+ tool.add_value_option(:include_mods,false,['--include-mods','Output gff entries for peptide modification sites'])
104
107
 
105
108
  exit unless tool.check_options(true,[:database,:coords_file])
106
109
 
@@ -170,6 +173,12 @@ proteins.each do |protein|
170
173
  peptide_entries.each do |peptide_entry|
171
174
  output_fh.write peptide_entry.to_s
172
175
  end
176
+ if tool.include_mods
177
+ mod_entries = peptide.mods_to_gff3_records(protein_entry.aaseq,gff_parent_entry,gff_cds_entries)
178
+ mod_entries.each do |mod_entry|
179
+ output_fh.write mod_entry.to_s
180
+ end
181
+ end
173
182
  end
174
183
  end
175
184
 
@@ -64,7 +64,12 @@ file.each do |entry|
64
64
  length = entry.naseq.length
65
65
 
66
66
  (1...7).each do |frame|
67
- translated_seq= entry.naseq.translate(frame)
67
+ begin
68
+ translated_seq= entry.naseq.translate(frame)
69
+ rescue => exception
70
+ puts "#{entry}"
71
+ exit 1
72
+ end
68
73
  orfs=translated_seq.split("*")
69
74
  orf_index = 0
70
75
  position = ((frame - 1) % 3) + 1
@@ -22,8 +22,10 @@ exit unless search_tool.check_options(true)
22
22
 
23
23
  # Our environment should be setup so that tandem or tandem.exe is on the path
24
24
  #
25
- tandem_bin=%x[which tandem].chomp
26
- tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
25
+ # tandem_bin=%x[which tandem].chomp
26
+ # tandem_bin=%x[which tandem.exe].chomp unless tandem_bin && tandem_bin.length>0
27
+
28
+ # tandem_bin
27
29
 
28
30
  @output_suffix="_tandem"
29
31
 
@@ -70,7 +72,7 @@ ARGV.each do |filename|
70
72
 
71
73
  # The basic command
72
74
  #
73
- cmd= "#{tandem_bin} #{params_path}"
75
+ cmd= "#{genv.tandem_bin} #{params_path}"
74
76
 
75
77
  # Add a cleanup command unless the user wants to keep params files
76
78
  #
@@ -63,6 +63,18 @@ class Constants
63
63
  "#{@protk_dir}/tools/msgfplus"
64
64
  end
65
65
 
66
+ def get_path_for_executable(exec_name_list)
67
+ exec_name_list.each do |exec_name|
68
+ exec_path=%x[which #{exec_name}].chomp
69
+ return exec_path unless !exec_path || exec_path.length==0
70
+ end
71
+ throw "Unable to locate #{exec_name_list}"
72
+ end
73
+
74
+ def tandem_bin
75
+ get_path_for_executable ["tandem","tandem.exe"]
76
+ end
77
+
66
78
  def msgfplusjar
67
79
  msgfplus_path=%x[which MSGFPlus.jar]
68
80
  msgfplus_path.chomp
@@ -0,0 +1,56 @@
1
+ #
2
+ # This file is part of protk
3
+ # Created by Ira Cooke 9/3/2017
4
+ #
5
+ # Provides common functionality used by tools that convert gff to a protein database
6
+ #
7
+ # These tools read a gff and then write out protein entries in the following format
8
+ #
9
+ # >lcl|<scaffold_id>_<orientation>_<transcript_id> gene_start|gene_end cds1_start|cds1_end cds2_start|cds2_end ...
10
+ #
11
+
12
+ require 'optparse'
13
+ require 'pathname'
14
+ require 'protk/tool'
15
+
16
+ class GffToProteinDBTool < Tool
17
+
18
+ attr_accessor :print_progress
19
+
20
+ # Initializes commandline options common to all such tools.
21
+ # Individual search tools can add their own options, but should use Capital letters to avoid conflicts
22
+ #
23
+ def initialize(option_support=[])
24
+ super(option_support)
25
+
26
+ if ( option_support.include? :add_transcript_info )
27
+ add_boolean_option(:add_transcript_info,false,['--info','Include CDS Coordinates'])
28
+ end
29
+
30
+ @option_parser.summary_width=40
31
+
32
+ @capturing_gene=false
33
+ @current_gene=nil
34
+ end
35
+
36
+ def start_new_gene(line)
37
+ if (line =~ /maker\sgene/)
38
+ new_gene = line.match(/ID=([^;]+)/).captures[0]
39
+ if new_gene!=@current_gene
40
+ @current_gene=new_gene
41
+ return true
42
+ end
43
+ end
44
+ end
45
+
46
+ def get_lines_matching(pattern,gene_lines)
47
+ match_lines=[]
48
+ gene_lines.each do |line|
49
+ if line =~ pattern
50
+ match_lines << line
51
+ end
52
+ end
53
+ match_lines
54
+ end
55
+
56
+ end
@@ -3,12 +3,15 @@ require 'bio'
3
3
  require 'protk/bio_gff3_extensions'
4
4
  require 'protk/mzidentml_doc'
5
5
  require 'protk/error'
6
+ require 'protk/peptide_mod'
7
+ # require 'protk/indistinguishable_peptide'
6
8
 
7
9
  include LibXML
8
10
 
9
11
  class PeptideNotInProteinError < ProtkError
10
12
  end
11
13
 
14
+
12
15
  class Peptide
13
16
 
14
17
  # Stripped sequence (no modifications)
@@ -17,6 +20,9 @@ class Peptide
17
20
  attr_accessor :charge
18
21
  attr_accessor :probability
19
22
  attr_accessor :theoretical_neutral_mass
23
+ attr_accessor :modifications
24
+ attr_accessor :modified_sequence
25
+ attr_accessor :indistinguishable_peptides
20
26
 
21
27
  def as_protxml
22
28
  node = XML::Node.new('peptide')
@@ -33,6 +39,27 @@ class Peptide
33
39
  pep.sequence=xmlnode['peptide_sequence']
34
40
  pep.probability=xmlnode['nsp_adjusted_probability'].to_f
35
41
  pep.charge=xmlnode['charge'].to_i
42
+
43
+ # This deals with the case where mods are on the primary peptide
44
+ #
45
+ mod_info_node = xmlnode.find('protxml:modification_info','protxml:http://regis-web.systemsbiology.net/protXML')
46
+
47
+ # The pepXML spec says there can be multiple modification_info nodes, but in practice there is never more than one.
48
+ # We assume either 1 or 0
49
+ if ( mod_info_node.length > 0 )
50
+ throw "Encountered multiple modification_info nodes for a peptide" if mod_info_node.length > 1
51
+ pep.modified_sequence = mod_info_node[0]['modified_peptide']
52
+ mod_nodes = mod_info_node[0].find('protxml:mod_aminoacid_mass','protxml:http://regis-web.systemsbiology.net/protXML')
53
+ # require 'byebug';byebug
54
+ pep.modifications = mod_nodes.collect { |e| PeptideMod.from_protxml(e) }
55
+ end
56
+
57
+ # This deals with indistinguishable peptides
58
+ #
59
+ ips = xmlnode.find('protxml:indistinguishable_peptide','protxml:http://regis-web.systemsbiology.net/protXML')
60
+ # require 'byebug';byebug
61
+ pep.indistinguishable_peptides = ips.collect { |e| IndistinguishablePeptide.from_protxml(e) }
62
+
36
63
  pep
37
64
  end
38
65
 
@@ -55,17 +82,23 @@ class Peptide
55
82
  pep.charge = best_psm.attributes['chargeState'].to_i
56
83
  pep.protein_name = mzid_doc.get_dbsequence(xmlnode.parent,xmlnode.parent.attributes['dBSequence_ref']).attributes['accession']
57
84
 
58
- # pep.charge = MzIdentMLDoc.get_charge_for_psm(best_psm)
59
85
 
60
86
  pep
61
87
  end
62
88
 
63
89
  def from_sequence(seq,charge=nil)
64
90
  pep=new()
65
- pep.sequence=seq
91
+
92
+ pep.modifications = pep.modifications_from_sequence(seq)
93
+ pep.modified_sequence = seq
94
+
95
+ seq = seq.sub(/^n\[[0-9]+?\]/,"")
96
+ pep.sequence = seq.gsub(/[0-9\.\[\]]/,"")
66
97
  pep.charge=charge
67
98
  pep
68
99
  end
100
+
101
+
69
102
  private :new
70
103
  end
71
104
 
@@ -73,6 +106,26 @@ class Peptide
73
106
 
74
107
  end
75
108
 
109
+ def modifications_from_sequence(seq)
110
+
111
+ seq = seq.sub(/^n\[[0-9]+?\]/,"")
112
+ offset = 0
113
+ mods = seq.enum_for(:scan, /([A-Z])\[([0-9\.]+)\]/).map {
114
+ pm = PeptideMod.from_data(Regexp.last_match.begin(0)+1-offset,Regexp.last_match.captures[0],Regexp.last_match.captures[1].to_f)
115
+ offset += Regexp.last_match.captures[1].length+2
116
+ pm
117
+ }
118
+
119
+ # if ( seq == "N[115]VMN[115]LTPAETQ[129]QLHAALESQLSPGELAK" )
120
+ # require 'byebug';byebug
121
+ # puts "hi"
122
+ # end
123
+
124
+
125
+ mods
126
+ end
127
+
128
+
76
129
  # Expects prot_seq not to contain explicit stop codon (ie * at end)
77
130
  # AA coords are 0-based unlike genomic coords which are 1 based
78
131
  #
@@ -112,23 +165,61 @@ class Peptide
112
165
  throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
113
166
  throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array
114
167
 
115
- on_reverse_strand = (parent_record.strand=="-") ? true : false
116
168
  aa_coords = coords_in_protein(prot_seq,false) # Always use forward protein coordinates
117
169
 
170
+ gff_records_for_coords_in_protein(aa_coords,self.sequence.length,parent_record,cds_records)
171
+ end
172
+
173
+ def mods_to_gff3_records(prot_seq,parent_record,cds_records)
174
+
175
+ throw "Expected GFF3 Record but got #{parent_record.class}" unless parent_record.class==Bio::GFF::GFF3::Record
176
+ throw "Expected Array but got #{cds_records.class}" unless cds_records.class==Array
177
+
178
+ pep_aa_coords = coords_in_protein(prot_seq,false)
179
+
180
+ mod_records = []
181
+
182
+ unless ( self.modifications.nil? )
183
+ self.modifications.each { |mod|
184
+ prot_position = mod.position+pep_aa_coords[:start]
185
+ mod_aa_coords = {:start => prot_position, :end => prot_position+1}
186
+ mod_records << gff_records_for_coords_in_protein(mod_aa_coords,1,parent_record,cds_records, {:type => "modified_amino_acid_feature", :mod => mod, :modified_sequence => self.modified_sequence})
187
+ }
188
+ end
189
+
190
+ unless ( self.indistinguishable_peptides.nil? )
191
+ self.indistinguishable_peptides.each { |ip|
192
+ unless ( ip.modifications.nil? )
193
+ ip.modifications.each { |mod|
194
+ prot_position = mod.position+pep_aa_coords[:start]-1
195
+ mod_aa_coords = {:start => prot_position, :end => prot_position+1}
196
+ mod_records << gff_records_for_coords_in_protein(mod_aa_coords,1,parent_record,cds_records, {:type => "modified_amino_acid_feature", :mod => mod, :modified_sequence => ip.modified_sequence})
197
+ }
198
+ end
199
+ }
200
+ end
201
+
202
+ mod_records.flatten
203
+
204
+ end
205
+
206
+
207
+ def gff_records_for_coords_in_protein(aa_coords,seqlen,parent_record,cds_records,record_info ={:type => "polypeptide"})
208
+ on_reverse_strand = (parent_record.strand=="-") ? true : false
118
209
  ordered_cds_records = on_reverse_strand ? cds_records.sort.reverse : cds_records.sort
119
210
 
120
211
  # Initial position is the number of NA's from the start of translation
121
212
  #
122
- pep_nalen = self.sequence.length*3
213
+ pep_nalen = seqlen*3
123
214
 
124
215
  i = 0; #Current protein position (in nucleic acids)
125
216
 
126
217
  pep_start_i = aa_coords[:start]*3
127
- pep_end_i = pep_start_i+self.sequence.length*3
128
- fragments=[]
218
+ pep_end_i = pep_start_i+seqlen*3
219
+ gff_records=[]
129
220
  ordered_cds_records.each do |cds_record|
130
221
 
131
- fragment = nil
222
+ gff_record = nil
132
223
  fragment_len = 0
133
224
  if on_reverse_strand
134
225
 
@@ -139,16 +230,16 @@ class Peptide
139
230
  fragment_end = cds_record.end
140
231
  fragment_len = [cds_record.length,pep_end_i-i].min
141
232
  fragment_start = fragment_end-fragment_len+1
142
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
233
+ gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
143
234
  elsif before_len>0
144
235
  fragment_end = cds_record.end - before_len
145
236
  fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
146
237
  fragment_start = fragment_end - fragment_len + 1
147
238
  if fragment_len>0
148
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
239
+ gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
149
240
  end
150
241
  else
151
- fragment=nil
242
+ gff_record=nil
152
243
  end
153
244
  else
154
245
  in_peptide = (i<pep_end_i) && (i>=pep_start_i)
@@ -157,33 +248,64 @@ class Peptide
157
248
  fragment_start = cds_record.start
158
249
  fragment_len = [cds_record.length,pep_end_i-i].min
159
250
  fragment_end = fragment_start+fragment_len-1
160
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
251
+ gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
161
252
  elsif before_len>0
162
253
  fragment_start = cds_record.start + before_len
163
254
  fragment_len = [cds_record.length-before_len,pep_end_i-i-before_len].min
164
255
  fragment_end = fragment_start + fragment_len-1
165
256
  if fragment_len>0
166
- fragment = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record)
257
+ gff_record = gff_record_for_peptide_fragment(fragment_start,fragment_end,cds_record,record_info)
167
258
  end
168
259
  else
169
- fragment=nil
260
+ gff_record = nil
170
261
  end
171
262
 
172
263
  end
173
264
  i+=cds_record.length
174
- fragments << fragment unless fragment.nil?
265
+ gff_records << gff_record unless gff_record.nil?
175
266
  end
176
- fragments
267
+ gff_records
177
268
  end
178
269
 
179
- def gff_record_for_peptide_fragment(start_i,end_i,parent_record)
270
+ def gff_record_for_peptide_fragment(start_i,end_i,parent_record,record_info)
180
271
  cds_id = parent_record.id
181
- this_id = "#{cds_id}.#{self.sequence}"
272
+ mod_sequence = record_info[:modified_sequence]
273
+ this_id = mod_sequence ? "#{cds_id}.#{mod_sequence}" : "#{cds_id}.#{self.sequence}"
182
274
  this_id << ".#{self.charge}" unless self.charge.nil?
275
+ mod = record_info[:mod]
276
+ this_id << ".#{mod.position}.#{mod.mass}" unless mod.nil?
183
277
  score = self.probability.nil? ? "." : self.probability.to_s
184
- gff_string = "#{parent_record.seqid}\tMSMS\tpolypeptide\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
278
+ record_type = mod.nil? ? record_info[:type] : "#{record_info[:type]}_#{mod.amino_acid}"
279
+ gff_string = "#{parent_record.seqid}\tMSMS\t#{record_type}\t#{start_i}\t#{end_i}\t#{score}\t#{parent_record.strand}\t0\tID=#{this_id};Parent=#{cds_id}"
185
280
  Bio::GFF::GFF3::Record.new(gff_string)
186
281
  end
187
282
 
283
+ end
284
+
188
285
 
189
- end
286
+ # <indistinguishable_peptide peptide_sequence="MEYENTLTAAMK" charge="2" calc_neutral_pep_mass="1416.63">
287
+ # <modification_info modified_peptide="M[147]EYENTLTAAMK"/>
288
+ # </indistinguishable_peptide>
289
+ class IndistinguishablePeptide < Peptide
290
+ class << self
291
+ def from_protxml(xmlnode)
292
+ pep=new()
293
+ pep.sequence=xmlnode['peptide_sequence']
294
+ pep.charge=xmlnode['charge'].to_i
295
+
296
+ mod_info_node = xmlnode.find('protxml:modification_info','protxml:http://regis-web.systemsbiology.net/protXML')
297
+
298
+ if ( mod_info_node.length > 0 )
299
+ throw "Encountered multiple modification_info nodes for an indistinguishable peptide" if mod_info_node.length > 1
300
+ pep.modified_sequence = mod_info_node[0]['modified_peptide']
301
+ mod_nodes = mod_info_node[0].find('protxml:mod_aminoacid_mass','protxml:http://regis-web.systemsbiology.net/protXML')
302
+ if ( mod_nodes.length > 0 )
303
+ pep.modifications = mod_nodes.collect { |e| PeptideMod.from_protxml(e) }
304
+ else
305
+ pep.modifications = pep.modifications_from_sequence(pep.modified_sequence)
306
+ end
307
+ end
308
+ pep
309
+ end
310
+ end
311
+ end
@@ -0,0 +1,42 @@
1
+ require 'libxml'
2
+ require 'bio'
3
+
4
+ include LibXML
5
+
6
+ class PeptideMod
7
+
8
+ # Position within the peptide, amino acid letter and mass of a single modification
9
+ attr_accessor :position
10
+ attr_accessor :amino_acid
11
+ attr_accessor :mass
12
+
13
+ class << self
14
+
15
+ # <modification_info modified_peptide="GFGFVTYSC[160]VEEVDAAMC[160]ARPHK">
16
+ # <mod_aminoacid_mass position="9" mass="160.030600"/>
17
+ # <mod_aminoacid_mass position="18" mass="160.030600"/>
18
+ # </modification_info>
19
+
20
+ def from_protxml(xmlnode)
21
+ pepmod = new()
22
+ pepmod.position=xmlnode['position'].to_i
23
+ pepmod.mass=xmlnode['mass'].to_f
24
+ pepmod
25
+ end
26
+
27
+ def from_data(position,amino_acid,mass)
28
+ pepmod = new()
29
+ pepmod.position = position
30
+ pepmod.amino_acid = amino_acid
31
+ pepmod.mass = mass
32
+ pepmod
33
+ end
34
+
35
+ private :new
36
+ end
37
+
38
+ def initialize()
39
+
40
+ end
41
+
42
+ end
@@ -153,6 +153,12 @@ file tpp_installed_file => [@build_dir,tpp_download_file] do
153
153
 
154
154
  sh %{cd #{@build_dir};tar -xvzf TPP-#{tpp_version}.tgz}
155
155
 
156
+ sh %{cp ~/Desktop/singleton.hpp #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/../extern/ProteoWizard/pwiz/libraries/boost_aux/boost/utility/singleton.hpp}
157
+
158
+ sh %{cp ~/Desktop/MascotScoreParser.h #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Validation/DiscriminateFunction/Mascot/MascotScoreParser.h}
159
+ sh %{cp ~/Desktop/PTMProphetParser.cxx #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Validation/PTMProphetParser/PTMProphetParser.cxx}
160
+ sh %{cp ~/Desktop/RespectFilter.h #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Validation/Respect/RespectFilter.h}
161
+
156
162
  File.open("#{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src/Makefile.config.incl","wb") do |f|
157
163
  f.write "TPP_ROOT=#{env.tpp_root}/\nTPP_WEB=/tpp/\nXSLT_PROC=/usr/bin/xsltproc\nCGI_USERS_DIR=${TPP_ROOT}cgi-bin/"
158
164
  end
@@ -173,13 +179,18 @@ file tpp_installed_file => [@build_dir,tpp_download_file] do
173
179
  makefile_text = File.read("#{makefile_path}")
174
180
 
175
181
  File.open("#{makefile_path}","w+") do |f|
176
- subs_text = makefile_text.gsub("cp -rfu","cp -rf")
182
+ subs_text = makefile_text.gsub("cp -rfu","cp -rf").gsub("-Werror","")
183
+ # subs_text = subs_text.gsub("-Werror","")
177
184
  f.write subs_text
178
185
  end
186
+
187
+
188
+
179
189
  end
180
190
  sh %{cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src;echo '' > ../perl/tpp_models.pl;echo '' > ../perl/exporTPP.pl;echo '' > ../CGI/show_nspbin.pl;echo '' > ../CGI/tpp_gui/tpp_gui.pl}
181
191
 
182
- build_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make -s"
192
+ # build_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make -s"
193
+ build_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make"
183
194
  install_cmd = "#{use_perl_locallib_cmd};cd #{@build_dir}/TPP-#{tpp_version}/trans_proteomic_pipeline/src ; make install"
184
195
  env.log build_cmd, :info
185
196
  sh %{#{build_cmd}}
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: protk
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.4.beta2
4
+ version: 1.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ira Cooke
@@ -14,170 +14,170 @@ dependencies:
14
14
  name: open4
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '1.3'
20
- - - '>='
20
+ - - ">="
21
21
  - !ruby/object:Gem::Version
22
22
  version: 1.3.0
23
23
  type: :runtime
24
24
  prerelease: false
25
25
  version_requirements: !ruby/object:Gem::Requirement
26
26
  requirements:
27
- - - ~>
27
+ - - "~>"
28
28
  - !ruby/object:Gem::Version
29
29
  version: '1.3'
30
- - - '>='
30
+ - - ">="
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.3.0
33
33
  - !ruby/object:Gem::Dependency
34
34
  name: bio
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
- - - ~>
37
+ - - "~>"
38
38
  - !ruby/object:Gem::Version
39
39
  version: 1.4.3
40
- - - '>='
40
+ - - ">="
41
41
  - !ruby/object:Gem::Version
42
42
  version: 1.4.3
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
- - - ~>
47
+ - - "~>"
48
48
  - !ruby/object:Gem::Version
49
49
  version: 1.4.3
50
- - - '>='
50
+ - - ">="
51
51
  - !ruby/object:Gem::Version
52
52
  version: 1.4.3
53
53
  - !ruby/object:Gem::Dependency
54
54
  name: rest-client
55
55
  requirement: !ruby/object:Gem::Requirement
56
56
  requirements:
57
- - - ~>
57
+ - - "~>"
58
58
  - !ruby/object:Gem::Version
59
59
  version: 1.6.7
60
- - - '>='
60
+ - - ">="
61
61
  - !ruby/object:Gem::Version
62
62
  version: 1.6.7
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
- - - ~>
67
+ - - "~>"
68
68
  - !ruby/object:Gem::Version
69
69
  version: 1.6.7
70
- - - '>='
70
+ - - ">="
71
71
  - !ruby/object:Gem::Version
72
72
  version: 1.6.7
73
73
  - !ruby/object:Gem::Dependency
74
74
  name: net-ftp-list
75
75
  requirement: !ruby/object:Gem::Requirement
76
76
  requirements:
77
- - - ~>
77
+ - - "~>"
78
78
  - !ruby/object:Gem::Version
79
79
  version: 3.2.5
80
- - - '>='
80
+ - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: 3.2.5
83
83
  type: :runtime
84
84
  prerelease: false
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
- - - ~>
87
+ - - "~>"
88
88
  - !ruby/object:Gem::Version
89
89
  version: 3.2.5
90
- - - '>='
90
+ - - ">="
91
91
  - !ruby/object:Gem::Version
92
92
  version: 3.2.5
93
93
  - !ruby/object:Gem::Dependency
94
94
  name: libxml-ruby
95
95
  requirement: !ruby/object:Gem::Requirement
96
96
  requirements:
97
- - - ~>
97
+ - - "~>"
98
98
  - !ruby/object:Gem::Version
99
- version: '2.7'
100
- - - '>='
99
+ version: '2.9'
100
+ - - ">="
101
101
  - !ruby/object:Gem::Version
102
- version: 2.7.0
102
+ version: 2.9.0
103
103
  type: :runtime
104
104
  prerelease: false
105
105
  version_requirements: !ruby/object:Gem::Requirement
106
106
  requirements:
107
- - - ~>
107
+ - - "~>"
108
108
  - !ruby/object:Gem::Version
109
- version: '2.7'
110
- - - '>='
109
+ version: '2.9'
110
+ - - ">="
111
111
  - !ruby/object:Gem::Version
112
- version: 2.7.0
112
+ version: 2.9.0
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: rspec
115
115
  requirement: !ruby/object:Gem::Requirement
116
116
  requirements:
117
- - - ~>
117
+ - - "~>"
118
118
  - !ruby/object:Gem::Version
119
119
  version: '3.0'
120
120
  type: :development
121
121
  prerelease: false
122
122
  version_requirements: !ruby/object:Gem::Requirement
123
123
  requirements:
124
- - - ~>
124
+ - - "~>"
125
125
  - !ruby/object:Gem::Version
126
126
  version: '3.0'
127
127
  - !ruby/object:Gem::Dependency
128
128
  name: rspec-mocks
129
129
  requirement: !ruby/object:Gem::Requirement
130
130
  requirements:
131
- - - ~>
131
+ - - "~>"
132
132
  - !ruby/object:Gem::Version
133
133
  version: '3.0'
134
134
  type: :development
135
135
  prerelease: false
136
136
  version_requirements: !ruby/object:Gem::Requirement
137
137
  requirements:
138
- - - ~>
138
+ - - "~>"
139
139
  - !ruby/object:Gem::Version
140
140
  version: '3.0'
141
141
  - !ruby/object:Gem::Dependency
142
142
  name: rake-compiler
143
143
  requirement: !ruby/object:Gem::Requirement
144
144
  requirements:
145
- - - ~>
145
+ - - "~>"
146
146
  - !ruby/object:Gem::Version
147
147
  version: '0'
148
148
  type: :development
149
149
  prerelease: false
150
150
  version_requirements: !ruby/object:Gem::Requirement
151
151
  requirements:
152
- - - ~>
152
+ - - "~>"
153
153
  - !ruby/object:Gem::Version
154
154
  version: '0'
155
155
  - !ruby/object:Gem::Dependency
156
156
  name: byebug
157
157
  requirement: !ruby/object:Gem::Requirement
158
158
  requirements:
159
- - - ~>
159
+ - - "~>"
160
160
  - !ruby/object:Gem::Version
161
161
  version: '3.5'
162
162
  type: :development
163
163
  prerelease: false
164
164
  version_requirements: !ruby/object:Gem::Requirement
165
165
  requirements:
166
- - - ~>
166
+ - - "~>"
167
167
  - !ruby/object:Gem::Version
168
168
  version: '3.5'
169
169
  - !ruby/object:Gem::Dependency
170
170
  name: sqlite3
171
171
  requirement: !ruby/object:Gem::Requirement
172
172
  requirements:
173
- - - ~>
173
+ - - "~>"
174
174
  - !ruby/object:Gem::Version
175
175
  version: '0'
176
176
  type: :runtime
177
177
  prerelease: false
178
178
  version_requirements: !ruby/object:Gem::Requirement
179
179
  requirements:
180
- - - ~>
180
+ - - "~>"
181
181
  - !ruby/object:Gem::Version
182
182
  version: '0'
183
183
  description: Commandline tools for proteomics
@@ -202,6 +202,7 @@ executables:
202
202
  - uniprot_mapper.rb
203
203
  - sixframe.rb
204
204
  - augustus_to_proteindb.rb
205
+ - maker_to_proteindb.rb
205
206
  - protxml_to_gff.rb
206
207
  - protxml_to_table.rb
207
208
  - swissprot_to_table.rb
@@ -222,6 +223,7 @@ files:
222
223
  - bin/filter_psms.rb
223
224
  - bin/interprophet.rb
224
225
  - bin/make_decoy.rb
226
+ - bin/maker_to_proteindb.rb
225
227
  - bin/manage_db.rb
226
228
  - bin/mascot_search.rb
227
229
  - bin/mascot_to_pepxml.rb
@@ -283,6 +285,7 @@ files:
283
285
  - lib/protk/fastadb.rb
284
286
  - lib/protk/galaxy_stager.rb
285
287
  - lib/protk/galaxy_util.rb
288
+ - lib/protk/gff_to_proteindb_tool.rb
286
289
  - lib/protk/gffdb.rb
287
290
  - lib/protk/manage_db_rakefile.rake
288
291
  - lib/protk/manage_db_tool.rb
@@ -292,6 +295,7 @@ files:
292
295
  - lib/protk/omssa_util.rb
293
296
  - lib/protk/openms_defaults.rb
294
297
  - lib/protk/peptide.rb
298
+ - lib/protk/peptide_mod.rb
295
299
  - lib/protk/pepxml.rb
296
300
  - lib/protk/pepxml_writer.rb
297
301
  - lib/protk/physical_constants.rb
@@ -324,19 +328,18 @@ require_paths:
324
328
  - lib
325
329
  required_ruby_version: !ruby/object:Gem::Requirement
326
330
  requirements:
327
- - - '>='
331
+ - - ">="
328
332
  - !ruby/object:Gem::Version
329
333
  version: '0'
330
334
  required_rubygems_version: !ruby/object:Gem::Requirement
331
335
  requirements:
332
- - - '>'
336
+ - - ">="
333
337
  - !ruby/object:Gem::Version
334
- version: 1.3.1
338
+ version: '0'
335
339
  requirements: []
336
340
  rubyforge_project:
337
- rubygems_version: 2.2.1
341
+ rubygems_version: 2.5.1
338
342
  signing_key:
339
343
  specification_version: 4
340
344
  summary: Proteomics Toolkit
341
345
  test_files: []
342
- has_rdoc: