protk 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/add_retention_times.rb +89 -0
- data/bin/augustus_to_proteindb.rb +193 -0
- data/bin/blastxml_to_table.rb +72 -0
- data/bin/feature_finder.rb +7 -1
- data/bin/make_decoy.rb +10 -2
- data/bin/mascot_search.rb +14 -4
- data/bin/msgfplus_search.rb +14 -5
- data/bin/peptide_prophet.rb +14 -7
- data/bin/protxml_to_gff.rb +624 -0
- data/bin/protxml_to_table.rb +19 -2
- data/bin/sixframe.rb +3 -1
- data/bin/tandem_search.rb +51 -23
- data/bin/toppas_pipeline.rb +8 -3
- data/bin/uniprot_annotation.rb +6 -1
- data/ext/protk/{protk.c → decoymaker/decoymaker.c} +13 -15
- data/ext/protk/decoymaker/extconf.rb +3 -0
- data/ext/protk/simplealign/extconf.rb +3 -0
- data/lib/protk/data/FeatureFinderIsotopeWavelet.ini +6 -6
- data/lib/protk/gapped_aligner.rb +264 -0
- data/lib/protk/manage_db_rakefile.rake +2 -1
- data/lib/protk/mascot_util.rb +7 -2
- data/lib/protk/randomize.rb +2 -2
- data/lib/protk/search_tool.rb +1 -1
- data/lib/protk/setup_rakefile.rake +25 -2
- data/lib/protk/spreadsheet_extensions.rb +1 -0
- data/lib/protk/swissprot_database.rb +11 -1
- metadata +30 -8
- data/bin/mascot2xml.rb +0 -87
- data/ext/protk/extconf.rb +0 -3
- data/lib/protk/data/pepxml_mascot_template.xml +0 -29
- data/lib/protk/data/predefined_db.trembl_annotation.yaml +0 -20
data/bin/msgfplus_search.rb
CHANGED
@@ -17,9 +17,10 @@ input_stager = nil
|
|
17
17
|
|
18
18
|
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
19
19
|
#
|
20
|
-
search_tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme,
|
20
|
+
search_tool=SearchTool.new([:background,:database,:explicit_output,:over_write,:enzyme,
|
21
21
|
:modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages])
|
22
22
|
|
23
|
+
search_tool.jobid_prefix="p"
|
23
24
|
search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
|
24
25
|
search_tool.options.output_suffix="_msgfplus"
|
25
26
|
|
@@ -135,7 +136,7 @@ ARGV.each do |filename|
|
|
135
136
|
if ( search_tool.explicit_output!=nil)
|
136
137
|
output_path=search_tool.explicit_output
|
137
138
|
else
|
138
|
-
output_path="#{search_tool.output_base_path(filename.chomp)}.
|
139
|
+
output_path="#{search_tool.output_base_path(filename.chomp)}.pep.xml"
|
139
140
|
end
|
140
141
|
|
141
142
|
|
@@ -232,20 +233,28 @@ ARGV.each do |filename|
|
|
232
233
|
# As a final part of the command we convert to pepxml
|
233
234
|
if search_tool.no_pepxml
|
234
235
|
cmd << "; cp #{mzid_output_path} #{output_path}"
|
235
|
-
|
236
|
+
else
|
237
|
+
#if search_tool.explicit_output
|
236
238
|
cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
|
237
239
|
#Then copy the pepxml to the final output path
|
238
|
-
cmd << ";
|
240
|
+
cmd << "; mv #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
|
239
241
|
end
|
240
242
|
|
241
243
|
|
242
244
|
# Up to here we've formulated the command. The rest is cleanup
|
243
245
|
p "Running:#{cmd}"
|
244
246
|
|
247
|
+
# In case the user specified background running we need to create a jobscript path
|
248
|
+
#
|
249
|
+
jobscript_path="#{output_path}.pbs.sh"
|
250
|
+
|
245
251
|
# Run the search
|
246
252
|
#
|
247
253
|
job_params= {:jobid => search_tool.jobid_from_filename(filename) }
|
248
|
-
|
254
|
+
job_params[:queue]="seventytwo"
|
255
|
+
job_params[:vmem]="70gb"
|
256
|
+
code = search_tool.run(cmd,genv,job_params,jobscript_path)
|
257
|
+
throw "Command failed with exit code #{code}" unless code==0
|
249
258
|
|
250
259
|
if for_galaxy
|
251
260
|
input_stager.restore_references(output_path)
|
data/bin/peptide_prophet.rb
CHANGED
@@ -85,7 +85,12 @@ end
|
|
85
85
|
prophet_tool.options.decoy_prefix="decoy"
|
86
86
|
prophet_tool.option_parser.on( '--decoy-prefix prefix', 'Prefix for decoy sequences') do |prefix|
|
87
87
|
prophet_tool.options.decoy_prefix = prefix
|
88
|
-
end
|
88
|
+
end
|
89
|
+
|
90
|
+
prophet_tool.options.no_decoys = false
|
91
|
+
prophet_tool.option_parser.on( '--no-decoy', 'Don\'t use decoy sequences to pin down the negative distribution') do
|
92
|
+
prophet_tool.options.no_decoys = true
|
93
|
+
end
|
89
94
|
|
90
95
|
prophet_tool.options.override_database=nil
|
91
96
|
prophet_tool.option_parser.on( '--override-database database', 'Manually specify database') do |database|
|
@@ -207,12 +212,14 @@ def generate_command(genv,prophet_tool,inputs,output,database,engine)
|
|
207
212
|
cmd << " -I2 -T3 -I4 -I5 -I6 -I7 "
|
208
213
|
end
|
209
214
|
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
215
|
+
unless prophet_tool.no_decoys
|
216
|
+
|
217
|
+
if engine=="omssa" || engine=="phenyx"
|
218
|
+
cmd << " -Op -P -d#{prophet_tool.decoy_prefix} "
|
219
|
+
else
|
220
|
+
cmd << " -d#{prophet_tool.decoy_prefix} "
|
221
|
+
end
|
222
|
+
end
|
216
223
|
|
217
224
|
if ( inputs.class==Array)
|
218
225
|
cmd << " #{inputs.join(" ")}"
|
@@ -0,0 +1,624 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Original python version created by Max Grant
|
5
|
+
# Translated to ruby by Ira Cooke 29/1/2013
|
6
|
+
#
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'protk/constants'
|
10
|
+
require 'protk/tool'
|
11
|
+
require 'protk/fastadb'
|
12
|
+
require 'protk/gapped_aligner'
|
13
|
+
require 'libxml'
|
14
|
+
require 'bio'
|
15
|
+
|
16
|
+
include LibXML
|
17
|
+
|
18
|
+
# Command-line setup for protxml_to_gff.rb.
#
# Builds the Tool with the shared :explicit_output option, registers the
# options specific to this script, validates that the mandatory ones were
# supplied and works out where the GFF output will be written.
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "

# ProtXML file holding the observed proteins and peptides (mandatory).
tool.options.protxml = nil
tool.option_parser.on('-p filename', '--protxml filename', 'Observed Data (ProtXML Format)') do |file|
  tool.options.protxml = file
end

# Protein database that was searched (mandatory).
tool.options.database = nil
tool.option_parser.on('-d filename', '--database filename', 'Database used for ms/ms searches (Fasta Format)') do |file|
  tool.options.database = file
end

# Nucleotide sequences of the scaffolds the proteins are mapped onto.
tool.options.genome = nil
tool.option_parser.on('-g filename', '--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)') do |file|
  tool.options.genome = file
end

tool.options.skip_fasta_indexing = false
tool.option_parser.on('--skip-index', 'Don\'t index database (Index should already exist)') do
  tool.options.skip_fasta_indexing = true
end

tool.options.peptide_probability_threshold = 0.95
tool.option_parser.on('--threshold prob', 'Peptide Probability Threshold (Default 0.95)') do |thresh|
  tool.options.peptide_probability_threshold = thresh.to_f
end

exit unless tool.check_options [:protxml,:database]

# Default output name, overridden by the explicit output option when given.
gff_out_file = "peptides.gff"
gff_out_file = tool.explicit_output unless tool.explicit_output.nil?

# Records are accumulated here and only written once all proteins have been
# processed, so the output file is deliberately not opened at this point.
gff_db = Bio::GFF.new()
|
56
|
+
|
57
|
+
|
58
|
+
# Parse a protXML document and return its protein nodes.
#
# protxml_file - path of the protXML file to read.
#
# Returns the result of the namespaced XPath query for every protein element
# anywhere in the document.
def parse_proteins(protxml_file)
  puts "Parsing proteins from protxml"
  document = XML::Parser.file(protxml_file).parse
  document.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
end
|
65
|
+
|
66
|
+
# Open a FASTA database for lookups, indexing it first when needed.
#
# database_path - either a path to an existing file, or a name that is
#                 resolved through Constants#current_database_for_name.
# type          - passed straight through to FastaDB.create (the callers in
#                 this script use 'prot' and 'nucl').
#
# Returns the FastaDB object produced by FastaDB.new or FastaDB.create.
def prepare_fasta(database_path,type)
  candidate = Pathname.new(database_path)
  db_filename = if candidate.exist? # It's an explicitly named db
    candidate.realpath.to_s
  else
    Constants.new.current_database_for_name(database_path)
  end

  # NOTE(review): the ".pin" suffix is checked whatever `type` is; confirm
  # that nucleotide databases also produce an index file with this name.
  if File.exist?("#{db_filename}.pin")
    puts "Using existing indexed database"
    FastaDB.new(db_filename)
  else
    puts "Indexing database"
    FastaDB.create(db_filename,db_filename,type)
  end
end
|
86
|
+
|
87
|
+
# Collect the names of a protein and of all proteins indistinguishable from it.
#
# protein_node - a protXML protein node.
#
# Returns an Array whose first element is the node's own protein_name,
# followed by the protein_name of each indistinguishable_protein child.
def protein_names(protein_node)
  indistinguishable = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
  names = [protein_node['protein_name']]
  indistinguishable.each { |node| names << node['protein_name'] }
  names
end
|
95
|
+
|
96
|
+
# Return the peptide child nodes of a protXML protein node.
#
# protein_node - a protXML protein node.
#
# Returns the result of the namespaced XPath query for its peptide children.
def peptide_nodes(protein_node)
  peptides = protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
  peptides
end
|
99
|
+
|
100
|
+
|
101
|
+
# Look up a single entry in a FASTA database.
#
# protein_name - identifier passed to fastadb.get_by_id.
# fastadb      - database object responding to get_by_id.
#
# Returns the entry found.
# Raises KeyError (after printing a message) when the lookup returns nil.
def get_fasta_record(protein_name,fastadb)
  record = fastadb.get_by_id(protein_name)
  return record unless record.nil?

  puts "Failed lookup for #{protein_name}"
  raise KeyError
end
|
110
|
+
|
111
|
+
# Genomic location details for one protein database entry.
#
# Instances are plain value holders populated by cds_info_from_fasta.
class CDSInfo
  attr_accessor :fasta_id
  attr_accessor :strand
  attr_accessor :frame
  attr_accessor :name
  attr_accessor :scaffold
  attr_accessor :start
  attr_accessor :end
  attr_accessor :coding_sequences
  attr_accessor :is_sixframe
  attr_accessor :gene_id

  # Report whether another entry occupies the same stretch of the genome.
  #
  # candidate_entry - another object exposing scaffold, strand, start and end.
  #
  # Returns true only when both entries are on the same scaffold and strand
  # and their start..end intervals intersect. Intervals that merely touch at
  # a boundary coordinate are treated as not overlapping.
  def overlap(candidate_entry)
    return false if candidate_entry.scaffold != scaffold
    # Compare against the candidate's strand, not our own.
    return false if candidate_entry.strand != strand
    # Candidate lies entirely after this entry ...
    return false if candidate_entry.start >= self.end
    # ... or entirely before it.
    return false if candidate_entry.end <= start

    true
  end
end
|
132
|
+
|
133
|
+
# Extract the first capture group of `pattern` from `text`.
#
# Raises EncodingError naming the missing field when the pattern does not
# match, so callers can skip entries whose header cannot be interpreted.
def cds_header_capture(text, pattern, what)
  match = pattern.match(text)
  raise EncodingError, "Unable to determine #{what} from '#{text}'" if match.nil?
  match[1]
end

# Build a CDSInfo from the header of a protein FASTA entry.
#
# fasta_entry - object exposing entry_id and identifiers.description.
#
# The description is read as whitespace separated "start|end" pairs: the
# first pair gives the overall start and end, any further pairs are stored
# as coding sequences. The scaffold and name are taken from entry_id. Entries
# whose id contains "frame" are flagged as six-frame translations and get
# their strand from the frame number (frames above 3 are '-'); all other
# entries get their strand from the presence of "rev" in the name and also
# have a gene id extracted from the name.
#
# Returns the populated CDSInfo.
# Raises EncodingError when the description holds no coordinates or the
# identifier does not have the expected layout.
def cds_info_from_fasta(fasta_entry)
  info = CDSInfo.new
  info.fasta_id = fasta_entry

  positions = fasta_entry.identifiers.description.split(' ').collect do |coords|
    coords.split('|').collect { |pos| pos.to_i }
  end
  raise EncodingError, "No coordinates found for '#{fasta_entry.entry_id}'" if positions.empty?

  info.coding_sequences = positions.length > 1 ? positions[1..-1] : []
  info.start = positions[0][0]
  info.end = positions[0][1]

  entry_id = fasta_entry.entry_id
  info.scaffold = cds_header_capture(entry_id, /(scaffold_?\d+)_/, "scaffold")
  info.name = cds_header_capture(entry_id, /lcl\|(.*)/, "name")

  if entry_id =~ /frame/
    info.frame = cds_header_capture(info.name, /frame_(\d)/, "frame")
    info.strand = (info.frame.to_i > 3) ? '-' : '+'
    info.is_sixframe = true
  else
    info.strand = (info.name =~ /rev/) ? '-' : '+'
    info.gene_id = cds_header_capture(info.name, /_\w{3}_(.*)\.t/, "gene id")
    info.is_sixframe = false
  end

  info
end
|
162
|
+
|
163
|
+
|
164
|
+
# Decide whether an entry covers a genome location not yet seen.
#
# candidate_entry  - entry exposing gene_id, to be compared.
# existing_entries - entries already accepted; each must expose gene_id and
#                    respond to overlap(entry).
#
# Returns false when any existing entry shares the candidate's gene id or
# overlaps it, true otherwise. Entries without a gene id (nil) are never
# treated as belonging to the same gene; only overlap can make them redundant.
def is_new_genome_location(candidate_entry,existing_entries)
  candidate_gene = candidate_entry.gene_id

  existing_entries.none? do |existing|
    same_gene = !candidate_gene.nil? && existing.gene_id == candidate_gene
    same_gene || existing.overlap(candidate_entry)
  end
end
|
181
|
+
|
182
|
+
|
183
|
+
# Build the GFF3 record describing a whole protein.
#
# protein_name - accepted for interface compatibility; not used here.
# entry_info   - object exposing scaffold, start, end, strand and name.
# prot_prob    - value stored in the record's score column.
# prot_id      - value stored as the record's ID attribute.
#
# Returns a Bio::GFF::GFF3::Record of feature type "protein" with source
# "MSMS", no frame, and ID and Name attributes.
def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
  prot_attributes = [["ID",prot_id],["Name",entry_info.name]]

  # Positional arguments: seqid, source, feature type, start, end, score,
  # strand, frame, attributes.
  Bio::GFF::GFF3::Record.new(entry_info.scaffold,"MSMS","protein",
    entry_info.start,entry_info.end,prot_prob,entry_info.strand,nil,prot_attributes)
end
|
190
|
+
|
191
|
+
# Fetch the nucleotide sequence underlying a protein entry.
#
# protein_info - object exposing scaffold, start, end and strand.
# genomedb     - database handed to get_fasta_record to find the scaffold.
#
# Returns the slice of the scaffold sequence for the entry; for entries on
# the '-' strand the slice is returned reverse complemented.
def get_dna_sequence(protein_info,genomedb)
  scaffold_record = get_fasta_record(protein_info.scaffold,genomedb)

  # NOTE(review): the slice runs from index start-1 up to and including index
  # end, one position further than the start-1..end-1 slices used for
  # fragments elsewhere in this file - confirm this is intended.
  first_index = protein_info.start-1
  region = scaffold_record.naseq.to_s[first_index..protein_info.end]

  return region unless protein_info.strand == "-"

  Bio::Sequence::NA.new(region).reverse_complement
end
|
202
|
+
|
203
|
+
# Check whether a peptide occurs in any six-frame translation of a sequence.
#
# pep_seq  - peptide sequence to look for.
# gene_seq - nucleotide sequence to translate.
#
# Returns true as soon as the translation in one of the frames 1 to 6
# contains pep_seq, false when none of them does.
def peptide_is_in_sixframe(pep_seq,gene_seq)
  nucleotides = Bio::Sequence::NA.new(gene_seq)
  (1..6).any? { |frame| nucleotides.translate(frame).index(pep_seq) }
end
|
212
|
+
|
213
|
+
# gene_seq should already have been reverse_complemented if on reverse strand
|
214
|
+
# Locate a peptide on the genome by gapped alignment against the gene.
#
# prot_seq     - accepted for interface compatibility; not used here.
# pep_seq      - peptide sequence.
# protein_info - object exposing strand, start, end and fasta_id.
# gene_seq     - nucleotide sequence of the gene; it should already have been
#                reverse complemented if the gene is on the reverse strand.
#
# Returns nil when the peptide is found in a six-frame translation of
# gene_seq. Otherwise returns a one-element Array holding a flat list of
# genomic coordinates [start1, end1, start2, end2, ...], one start/end pair
# per aligned fragment.
def get_peptide_coordinates_by_alignment(prot_seq,pep_seq,protein_info,gene_seq)
  return nil if peptide_is_in_sixframe(pep_seq,gene_seq)

  puts "Warning. Actually found a gap #{protein_info.fasta_id}"
  aln = GappedAligner.new().align(pep_seq,gene_seq)

  # Anything other than exactly one gap is reported but still converted; the
  # interactive debugger call that used to sit here has been removed.
  puts "More than one intron.#{aln}" unless aln.gaps.length==1

  on_reverse = protein_info.strand=='-'
  frags = on_reverse ? aln.fragments.reverse : aln.fragments

  pep_coords = []
  frags.each do |frag|
    if protein_info.strand=='+'
      pep_coords << protein_info.start + frag[0]
      pep_coords << protein_info.start + frag[1]
    else
      # Fragment offsets are measured from the end of the gene here.
      pep_coords << protein_info.end - frag[1]
      pep_coords << protein_info.end - frag[0]
    end
  end

  [pep_coords]
end
|
243
|
+
|
244
|
+
# Convert peptide positions within a protein into genomic fragments, taking
# the introns between coding sequences into account.
#
# pepstart         - residue offset of the peptide within the protein.
# pepend           - residue offset just past the peptide.
# gene_start       - genomic start of the gene.
# gene_end         - accepted for interface compatibility; not used here.
# coding_sequences - Array of [start, end] pairs, one per coding sequence.
#
# The walk visits the coding sequences in ascending start order. Every intron
# passed on the way shifts the positions still being looked for by the intron
# length. The first fragment runs from the peptide start to the end of the
# coding sequence containing it, coding sequences lying wholly before the
# peptide end are emitted in full, and the last fragment runs from the start
# of the coding sequence containing the peptide end up to that end.
#
# Returns a one-element Array holding the list of [start, end] fragments.
# Raises RuntimeError when the walk needs a further coding sequence and none
# is left.
def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
  sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }

  # Assume positive strand
  pi_start = pepstart*3+gene_start-1
  pi_end = pepend*3+gene_start-1

  fragments = []
  p_i = pi_start # Initially we are looking for the first fragment
  finding_start = true

  sorted_cds.each_with_index do |cds_coords, i|
    cds_start = cds_coords[0]
    cds_end = cds_coords[1]
    next_coords = sorted_cds[i+1]

    if cds_end < p_i # Exon is before index in sequence and doesn't contain p_i
      if next_coords.nil?
        raise "Position #{p_i} lies beyond the last coding sequence (#{cds_start}-#{cds_end})"
      end

      intron_offset = (next_coords[0]-cds_end)-1
      p_i += intron_offset
      pi_end += intron_offset
      # Once the start has been found, an exon passed in full is a middle exon
      fragments << [cds_start,cds_end] unless finding_start
    elsif finding_start
      if next_coords.nil?
        raise "Peptide starts in the last coding sequence (#{cds_start}-#{cds_end}); no following exon to continue into"
      end

      fragments << [p_i+1,cds_end]
      intron_offset = (next_coords[0]-cds_end)-1
      pi_end += intron_offset
      # From here on we look for the (shifted) end of the peptide
      p_i = pi_end
      finding_start = false
    else # A terminal exon
      fragments << [cds_start,p_i]
      break
    end
  end

  [fragments]
end
|
290
|
+
|
291
|
+
# gene_seq should already have been reverse_complemented if on reverse strand
|
292
|
+
# Map a peptide from a predicted transcript onto genomic fragments.
#
# prot_seq     - protein sequence of the transcript.
# pep_seq      - peptide sequence.
# protein_info - object exposing strand, start, end and coding_sequences.
# gene_seq     - nucleotide sequence of the gene; it should already have been
#                reverse complemented if the gene is on the reverse strand.
#
# Returns nil when the peptide occurs in a six-frame translation of gene_seq,
# or (after printing a warning) when it cannot be found in prot_seq.
# Otherwise returns the result of fragment_coords_from_protein_coords.
def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
  return nil if peptide_is_in_sixframe(pep_seq,gene_seq)

  on_reverse = protein_info.strand=='-'
  pep_start_i = if on_reverse
    prot_seq.reverse.index(pep_seq.reverse)
  else
    prot_seq.index(pep_seq)
  end

  if pep_start_i.nil?
    puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
    return nil
  end

  # Plus 1 because on reverse stand stop-codon will be at the beginning of the
  # sequence (when read forwards). Need to eliminate it.
  pep_start_i += 1 if on_reverse
  pep_end_i = pep_start_i+pep_seq.length

  fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
end
|
320
|
+
|
321
|
+
# Find the genomic coordinates of every occurrence of a peptide within a
# six-frame translated protein.
#
# prot_seq     - protein sequence.
# pep_seq      - peptide sequence.
# protein_info - object exposing strand and start.
#
# For entries on the '-' strand both sequences are reversed before searching.
# Occurrences are located left to right without overlapping one another, and
# each is converted to nucleotide coordinates relative to protein_info.start
# (three nucleotides per residue).
#
# Returns an Array with one element per occurrence, each of the form
# [[genomic_start, genomic_end]]. The Array is empty when the peptide is
# empty or does not occur.
def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
  return [] if pep_seq.empty?

  if protein_info.strand == '-'
    prot_seq = prot_seq.reverse
    pep_seq = pep_seq.reverse
  end

  # Plain substring search: each search resumes just past the previous hit so
  # that repeated occurrences are reported at their own positions.
  start_indexes = []
  search_from = 0
  while (found = prot_seq.index(pep_seq, search_from))
    start_indexes << found
    search_from = found + pep_seq.length
  end

  start_indexes.collect do |si|
    pep_genomic_start = protein_info.start + 3*si
    pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
    [[pep_genomic_start,pep_genomic_end]]
  end
end
|
342
|
+
|
343
|
+
# Work out where a peptide sits on the genome.
#
# prot_seq     - protein sequence.
# pep_seq      - peptide sequence.
# protein_info - object exposing is_sixframe plus whatever the chosen
#                strategy reads.
# gene_seq     - nucleotide sequence of the gene; only used for entries that
#                are not six-frame translations.
#
# Returns whatever the selected strategy returns:
# get_peptide_coordinates_sixframe for six-frame entries, otherwise
# get_peptide_coordinates_from_transcript_info.
def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
  return get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info) if protein_info.is_sixframe

  get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
end
|
351
|
+
|
352
|
+
|
353
|
+
# Build one GFF3 record per genomic fragment of a peptide.
#
# coords       - Array of [start, end] genomic fragments.
# protein_info - object exposing scaffold and strand.
# pep_id       - ID of the parent peptide record.
# peptide_seq  - expected amino acid sequence of the fragments put together.
# genomedb     - database handed to get_fasta_record to find the scaffold.
# name         - feature type of the records; "CDS" gives the fragment ID the
#                suffix ".fg", anything else the suffix ".sp".
#
# Each record carries the translation of its own fragment as the Name
# attribute ("*" when translating fails) and the phase carried over from the
# preceding fragment in its frame column. Once the records are built, the
# fragments are joined and translated as a whole; a mismatch with
# peptide_seq is reported on stdout.
#
# Returns the Array of Bio::GFF::GFF3::Record objects, in coords order for
# the '+' strand and in reverse order otherwise.
def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
  scaff = get_fasta_record(protein_info.scaffold,genomedb)
  scaffold_seq = Bio::Sequence::NA.new(scaff.seq)

  fragment_phase = 0
  ordered_coords = protein_info.strand=='+' ? coords : coords.reverse
  frag_id = name=="CDS" ? "#{pep_id}.fg" : "#{pep_id}.sp"

  gff_lines = ordered_coords.collect do |frag_start,frag_end|
    frag_naseq = scaffold_seq[frag_start-1..frag_end-1]

    frag_seq = begin
      coding = protein_info.strand=='-' ? frag_naseq.reverse_complement : frag_naseq
      coding.translate(fragment_phase+1)
    rescue
      puts "Unable to translate #{frag_naseq}" if frag_naseq.length > 1
      "*"
    end

    fragment_record = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
      name,frag_start,frag_end,'',
      protein_info.strand,fragment_phase,[["Parent",pep_id],["ID",frag_id],["Name",frag_seq]])

    # Nucleotides left over after the last complete codon of this fragment
    # decide how many must be skipped at the start of the next one.
    remainder = (frag_naseq.length-fragment_phase) % 3
    fragment_phase = (3-remainder) % 3

    fragment_record
  end

  # Sanity check: the fragments joined in genomic order must translate back
  # to the peptide. Skipped when there are no fragments to join.
  concat_seq = coords.collect { |frag_start,frag_end| scaffold_seq[frag_start-1..frag_end-1] }.inject { |joined,frag| joined + frag }
  unless concat_seq.nil?
    check_seq = protein_info.strand=='-' ? concat_seq.reverse_complement.translate : concat_seq.translate
    puts "Fragment seqs not equal to peptide seqs" if check_seq != peptide_seq
  end

  gff_lines
end
|
412
|
+
|
413
|
+
# Work out the genomic coordinates of a start codon implied by a peptide.
#
# peptide_genomic_start - genomic start of the peptide.
# peptide_genomic_end   - genomic end of the peptide.
# peptide_seq           - peptide sequence.
# protein_seq           - sequence of the protein containing the peptide.
# strand                - '+' or anything else for the reverse strand.
#
# A start codon is reported only when the residue at the peptide's first
# position in the protein is 'M' and the peptide either begins the protein or
# is preceded by a residue other than 'K' or 'R'.
#
# Returns [first, first+2], or nil when no start codon is implied.
def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  pi = protein_seq.index(peptide_seq)
  return nil unless protein_seq[pi]=='M'

  preceded_by_cleavage_site = pi>0 && ['K','R'].include?(protein_seq[pi-1])
  return nil if preceded_by_cleavage_site

  # NOTE(review): for the reverse strand this gives end-1..end+1, which runs
  # one position past peptide_genomic_end - confirm the intended convention.
  first = strand=='+' ? peptide_genomic_start : peptide_genomic_end-1
  [first,first+2]
end
|
431
|
+
|
432
|
+
# Work out the genomic coordinates marking the C-terminal end of a peptide
# that does not end at a tryptic cleavage site.
#
# peptide_genomic_start - genomic start of the peptide.
# peptide_genomic_end   - genomic end of the peptide.
# peptide_seq           - peptide sequence.
# protein_seq           - accepted for interface compatibility; not used here.
# strand                - '+' or anything else for the reverse strand.
#
# Returns nil when the peptide ends in 'K' or 'R'; otherwise [first, first+2].
def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  last_residue = peptide_seq[-1]
  return nil if last_residue=='K' || last_residue=='R'

  # NOTE(review): this yields end-3..end-1 on '+' and start+1..start+3
  # otherwise - confirm these offsets against the coordinate convention.
  first = if strand=='+'
    peptide_genomic_end-3
  else
    peptide_genomic_start+1
  end

  [first,first+2]
end
|
443
|
+
|
444
|
+
|
445
|
+
# Extract the leader sequence in front of a peptide whose N-terminus is not
# explained by tryptic cleavage.
#
# peptide_seq - peptide sequence.
# protein_seq - sequence of the protein containing the peptide.
#
# A leader is only looked for when the peptide does not begin the protein, is
# not preceded by 'K' or 'R' and does not itself start with 'M'. The leader
# then runs from the closest 'M' upstream of the peptide to the residue just
# before the peptide.
#
# Returns the leader sequence, or nil when the conditions are not met or no
# upstream 'M' exists (in which case a message is printed).
def get_signal_peptide_for_peptide(peptide_seq,protein_seq)
  pi = protein_seq.index(peptide_seq)
  return nil unless pi>0
  return nil if ['K','R'].include?(protein_seq[pi-1]) || protein_seq[pi]=='M'

  # Closest methionine at or before the peptide start
  met_index = protein_seq.rindex('M',pi)
  if met_index.nil?
    puts "No methionine found ahead of peptide sequence. Unable to determine signal peptide sequence"
    return nil
  end

  protein_seq[met_index..(pi-1)]
end
|
463
|
+
|
464
|
+
# Build every GFF3 record for one peptide observed on one protein.
#
# protein_seq   - amino acid sequence of the protein.
# peptide_seq   - amino acid sequence of the peptide.
# protein_info  - object exposing is_sixframe, scaffold and strand plus
#                 whatever the coordinate helpers read.
# prot_id       - ID of the parent protein record.
# peptide_prob  - value stored in the peptide record's score column.
# peptide_count - number used to build the peptide ID "<prot_id>.p<count>".
# genomedb      - genome database; mandatory for entries that are not
#                 six-frame translations.
#
# For every genomic placement of the peptide the result holds a "peptide"
# record, its "CDS" fragment records, optionally "start_codon" and "cterm"
# records, and optionally "signalpeptide" fragment records.
#
# Returns an Array of records; empty when get_peptide_coordinates returns nil.
# Raises ArgumentError when a non six-frame entry is given without a genome.
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,genomedb=nil)
  dna_sequence = nil
  unless protein_info.is_sixframe
    raise ArgumentError, "A genome is required if predicted transcripts are to be mapped" if genomedb.nil?
    dna_sequence = get_dna_sequence(protein_info,genomedb)
  end

  peptide_coords = get_peptide_coordinates(protein_seq,peptide_seq,protein_info,dna_sequence)

  # Return value of nil means the entry is a predicted transcript that should
  # already be covered by 6-frame
  return [] if peptide_coords.nil?

  gff_records = []

  # Now convert peptide coordinate to genome coordinates
  # And create gff lines for each match
  peptide_coords.each do |coords|
    pep_genomic_start = coords.first[0]
    pep_genomic_end = coords.last[1]

    pep_id = "#{prot_id}.p#{peptide_count.to_s}"
    pep_attributes = [["ID",pep_id],["Parent",prot_id],["Name",peptide_seq]]

    pep_gff_line = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
      "peptide",pep_genomic_start,pep_genomic_end,peptide_prob,
      protein_info.strand,nil,pep_attributes)

    # For standard peptides
    # NOTE(review): genomedb is handed on even for six-frame entries, where it
    # may still be nil - confirm that a genome is always supplied in practice.
    frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
    gff_records += [pep_gff_line] + frag_gffs

    # For peptides with only 1 tryptic terminus
    start_codon_coords = get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if start_codon_coords
      # Attributes are a list of [tag, value] pairs, like the other records.
      start_codon_gff = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
        "start_codon",start_codon_coords[0],start_codon_coords[1],'',
        protein_info.strand,nil,[["Parent",pep_id]])
      gff_records += [start_codon_gff]
    end

    cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if cterm_coords
      cterm_gff = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
        "cterm",cterm_coords[0],cterm_coords[1],'',
        protein_info.strand,nil,[["Parent",pep_id]])
      # Append the cterm record itself (not the start codon record).
      gff_records += [cterm_gff]
    end

    signal_peptide = get_signal_peptide_for_peptide(peptide_seq,protein_seq)
    next unless signal_peptide

    signal_peptide_coords = get_peptide_coordinates(protein_seq,signal_peptide,protein_info,dna_sequence)
    next unless signal_peptide_coords

    signal_peptide_coords.each do |spcoords|
      gff_records += generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,signal_peptide,genomedb,"signalpeptide")
    end
  end

  puts gff_records

  gff_records
end
|
539
|
+
|
540
|
+
# Main script: read the proteins, map their peptides onto the genome and
# write every resulting record to the GFF output file.
proteins = parse_proteins(tool.protxml)
fastadb = prepare_fasta(tool.database,'prot')
genomedb = nil
genomedb = prepare_fasta(tool.genome,'nucl') if tool.genome

puts "Aligning peptides and writing GFF data..."

low_prob = 0
skipped = 0
peptide_count = 0
protein_count = 0
total_peptides = 0

proteins.each do |prot|
  prot_prob = prot['probability']
  # Proteins are filtered with the same threshold as peptides.
  next if prot_prob.to_f < tool.peptide_probability_threshold

  # Gets identifiers of all proteins (including indistinguishable ones)
  prot_names = protein_names(prot)

  peptides = peptide_nodes(prot)
  entries_covered = []

  prot_names.each do |protein_name|
    protein_count += 1
    prot_id = "pr#{protein_count.to_s}"

    begin
      protein_fasta_entry = get_fasta_record(protein_name,fastadb)
      protein_info = cds_info_from_fasta(protein_fasta_entry)

      if is_new_genome_location(protein_info,entries_covered)
        # The protein record carries the same ID that its peptides name as
        # their Parent.
        protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,prot_id)

        # NOTE(review): a version header and a sequence-region with the fixed
        # extent "1 160" are emitted for every protein - confirm intended.
        gff_db.records += ["##gff-version 3\n","##sequence-region #{protein_info.scaffold} 1 160\n",protein_gff]

        prot_seq = protein_fasta_entry.aaseq.to_s
        raise "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s

        peptide_count = 1
        peptides.each do |peptide|
          pprob = peptide['nsp_adjusted_probability'].to_f
          next unless pprob >= tool.peptide_probability_threshold

          total_peptides += 1
          pep_seq = peptide['peptide_sequence']

          gff_db.records += generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,genomedb)
          peptide_count += 1
        end
      else
        puts "Skipping redundant entry #{protein_name}"
        protein_count -= 1 # To counter +1 prior to begin rescue end block
      end

      entries_covered << protein_info
    rescue KeyError,EncodingError
      # Entry missing from the database or with an unusable header.
      skipped += 1
    end
  end
end

# The block form closes the file even when writing a record fails.
File.open(gff_out_file,'w+') do |f|
  gff_db.records.each { |rec| f.write(rec.to_s) }
end

p "Finished."
p "Proteins: #{protein_count}"
p "Skipped Decoys: #{skipped}"
p "Total Peptides: #{total_peptides}"
p "Peptides Written: #{total_peptides - low_prob}"
p "Peptides Culled: #{low_prob}"
exit(0)
|