protk 1.2.4 → 1.2.5
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/add_retention_times.rb +89 -0
- data/bin/augustus_to_proteindb.rb +193 -0
- data/bin/blastxml_to_table.rb +72 -0
- data/bin/feature_finder.rb +7 -1
- data/bin/make_decoy.rb +10 -2
- data/bin/mascot_search.rb +14 -4
- data/bin/msgfplus_search.rb +14 -5
- data/bin/peptide_prophet.rb +14 -7
- data/bin/protxml_to_gff.rb +624 -0
- data/bin/protxml_to_table.rb +19 -2
- data/bin/sixframe.rb +3 -1
- data/bin/tandem_search.rb +51 -23
- data/bin/toppas_pipeline.rb +8 -3
- data/bin/uniprot_annotation.rb +6 -1
- data/ext/protk/{protk.c → decoymaker/decoymaker.c} +13 -15
- data/ext/protk/decoymaker/extconf.rb +3 -0
- data/ext/protk/simplealign/extconf.rb +3 -0
- data/lib/protk/data/FeatureFinderIsotopeWavelet.ini +6 -6
- data/lib/protk/gapped_aligner.rb +264 -0
- data/lib/protk/manage_db_rakefile.rake +2 -1
- data/lib/protk/mascot_util.rb +7 -2
- data/lib/protk/randomize.rb +2 -2
- data/lib/protk/search_tool.rb +1 -1
- data/lib/protk/setup_rakefile.rake +25 -2
- data/lib/protk/spreadsheet_extensions.rb +1 -0
- data/lib/protk/swissprot_database.rb +11 -1
- metadata +30 -8
- data/bin/mascot2xml.rb +0 -87
- data/ext/protk/extconf.rb +0 -3
- data/lib/protk/data/pepxml_mascot_template.xml +0 -29
- data/lib/protk/data/predefined_db.trembl_annotation.yaml +0 -20
data/bin/msgfplus_search.rb
CHANGED
@@ -17,9 +17,10 @@ input_stager = nil
|
|
17
17
|
|
18
18
|
# Setup specific command-line options for this tool. Other options are inherited from SearchTool
|
19
19
|
#
|
20
|
-
search_tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme,
|
20
|
+
search_tool=SearchTool.new([:background,:database,:explicit_output,:over_write,:enzyme,
|
21
21
|
:modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages])
|
22
22
|
|
23
|
+
search_tool.jobid_prefix="p"
|
23
24
|
search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
|
24
25
|
search_tool.options.output_suffix="_msgfplus"
|
25
26
|
|
@@ -135,7 +136,7 @@ ARGV.each do |filename|
|
|
135
136
|
if ( search_tool.explicit_output!=nil)
|
136
137
|
output_path=search_tool.explicit_output
|
137
138
|
else
|
138
|
-
output_path="#{search_tool.output_base_path(filename.chomp)}.
|
139
|
+
output_path="#{search_tool.output_base_path(filename.chomp)}.pep.xml"
|
139
140
|
end
|
140
141
|
|
141
142
|
|
@@ -232,20 +233,28 @@ ARGV.each do |filename|
|
|
232
233
|
# As a final part of the command we convert to pepxml
|
233
234
|
if search_tool.no_pepxml
|
234
235
|
cmd << "; cp #{mzid_output_path} #{output_path}"
|
235
|
-
|
236
|
+
else
|
237
|
+
#if search_tool.explicit_output
|
236
238
|
cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
|
237
239
|
#Then copy the pepxml to the final output path
|
238
|
-
cmd << ";
|
240
|
+
cmd << "; mv #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
|
239
241
|
end
|
240
242
|
|
241
243
|
|
242
244
|
# Up to here we've formulated the command. The rest is cleanup
|
243
245
|
p "Running:#{cmd}"
|
244
246
|
|
247
|
+
# In case the user specified background running we need to create a jobscript path
|
248
|
+
#
|
249
|
+
jobscript_path="#{output_path}.pbs.sh"
|
250
|
+
|
245
251
|
# Run the search
|
246
252
|
#
|
247
253
|
job_params= {:jobid => search_tool.jobid_from_filename(filename) }
|
248
|
-
|
254
|
+
job_params[:queue]="seventytwo"
|
255
|
+
job_params[:vmem]="70gb"
|
256
|
+
code = search_tool.run(cmd,genv,job_params,jobscript_path)
|
257
|
+
throw "Command failed with exit code #{code}" unless code==0
|
249
258
|
|
250
259
|
if for_galaxy
|
251
260
|
input_stager.restore_references(output_path)
|
data/bin/peptide_prophet.rb
CHANGED
@@ -85,7 +85,12 @@ end
|
|
85
85
|
prophet_tool.options.decoy_prefix="decoy"
|
86
86
|
prophet_tool.option_parser.on( '--decoy-prefix prefix', 'Prefix for decoy sequences') do |prefix|
|
87
87
|
prophet_tool.options.decoy_prefix = prefix
|
88
|
-
end
|
88
|
+
end
|
89
|
+
|
90
|
+
prophet_tool.options.no_decoys = false
|
91
|
+
prophet_tool.option_parser.on( '--no-decoy', 'Don\'t use decoy sequences to pin down the negative distribution') do
|
92
|
+
prophet_tool.options.no_decoys = true
|
93
|
+
end
|
89
94
|
|
90
95
|
prophet_tool.options.override_database=nil
|
91
96
|
prophet_tool.option_parser.on( '--override-database database', 'Manually specify database') do |database|
|
@@ -207,12 +212,14 @@ def generate_command(genv,prophet_tool,inputs,output,database,engine)
|
|
207
212
|
cmd << " -I2 -T3 -I4 -I5 -I6 -I7 "
|
208
213
|
end
|
209
214
|
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
215
|
+
unless prophet_tool.no_decoys
|
216
|
+
|
217
|
+
if engine=="omssa" || engine=="phenyx"
|
218
|
+
cmd << " -Op -P -d#{prophet_tool.decoy_prefix} "
|
219
|
+
else
|
220
|
+
cmd << " -d#{prophet_tool.decoy_prefix} "
|
221
|
+
end
|
222
|
+
end
|
216
223
|
|
217
224
|
if ( inputs.class==Array)
|
218
225
|
cmd << " #{inputs.join(" ")}"
|
@@ -0,0 +1,624 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# This file is part of protk
|
4
|
+
# Original python version created by Max Grant
|
5
|
+
# Translated to ruby by Ira Cooke 29/1/2013
|
6
|
+
#
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'protk/constants'
|
10
|
+
require 'protk/tool'
|
11
|
+
require 'protk/fastadb'
|
12
|
+
require 'protk/gapped_aligner'
|
13
|
+
require 'libxml'
|
14
|
+
require 'bio'
|
15
|
+
|
16
|
+
include LibXML
|
17
|
+
|
18
|
+
# Command-line setup for protxml_to_gff.rb.
# Only :explicit_output is requested from Tool; every other option is declared here.
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "

tool.options.protxml = nil
tool.option_parser.on('-p filename', '--protxml filename', 'Observed Data (ProtXML Format)') do |file|
  tool.options.protxml = file
end

tool.options.database = nil
tool.option_parser.on('-d filename', '--database filename', 'Database used for ms/ms searches (Fasta Format)') do |file|
  tool.options.database = file
end

tool.options.genome = nil
tool.option_parser.on('-g filename', '--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)') do |file|
  tool.options.genome = file
end

# NOTE(review): this flag is recorded but nothing in this script reads it.
tool.options.skip_fasta_indexing = false
tool.option_parser.on('--skip-index', 'Don\'t index database (Index should already exist)') do
  tool.options.skip_fasta_indexing = true
end

tool.options.peptide_probability_threshold = 0.95
tool.option_parser.on('--threshold prob', 'Peptide Probability Threshold (Default 0.95)') do |thresh|
  tool.options.peptide_probability_threshold = thresh.to_f
end

exit unless tool.check_options [:protxml,:database]

# Output goes to peptides.gff unless an explicit output path was given.
gff_out_file = tool.explicit_output.nil? ? "peptides.gff" : tool.explicit_output

gff_db = Bio::GFF.new()
# The output file is intentionally not opened here: the records are only written
# at the very end of the script, and opening it early left an unused, never-closed
# handle and truncated any existing output before the inputs were even parsed.
|
56
|
+
|
57
|
+
|
58
|
+
# Parses a protXML document and returns its protein nodes.
#
# protxml_file - path of the protXML file to read.
#
# Returns the result of the XPath query for all protxml:protein elements.
def parse_proteins(protxml_file)
  puts "Parsing proteins from protxml"
  document = XML::Parser.file(protxml_file).parse
  document.find('.//protxml:protein', 'protxml:http://regis-web.systemsbiology.net/protXML')
end
|
65
|
+
|
66
|
+
# Opens (indexing first if necessary) the FASTA database named by database_path.
#
# database_path - either a path to an existing FASTA file or a database name that
#                 Constants#current_database_for_name can resolve.
# type          - 'prot' or 'nucl'; forwarded to FastaDB.create when indexing.
#
# Returns the FastaDB object used for lookups.
def prepare_fasta(database_path,type)
  db_filename =
    if Pathname.new(database_path).exist? # It's an explicitly named db
      Pathname.new(database_path).realpath.to_s
    else
      Constants.new.current_database_for_name(database_path)
    end

  # The existence check previously looked for "<db>.pin" regardless of type.
  # NOTE(review): assumes FastaDB.create writes BLAST-style index files, i.e. ".pin"
  # for protein and ".nin" for nucleotide databases — confirm against FastaDB.
  index_extension = (type == 'nucl') ? 'nin' : 'pin'
  db_indexfilename = "#{db_filename}.#{index_extension}"

  if File.exist?(db_indexfilename)
    puts "Using existing indexed database"
    FastaDB.new(db_filename)
  else
    puts "Indexing database"
    FastaDB.create(db_filename,db_filename,type)
  end
end
|
86
|
+
|
87
|
+
# Collects the names of a protein and of all proteins indistinguishable from it.
#
# protein_node - a protXML protein node (must respond to #find and #[]).
#
# Returns an Array whose first element is the node's own 'protein_name', followed by
# the 'protein_name' of each indistinguishable_protein child, in document order.
def protein_names(protein_node)
  indis_proteins = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
  [protein_node['protein_name']] + indis_proteins.map { |protein| protein['protein_name'] }
end
|
95
|
+
|
96
|
+
# Returns the protxml:peptide children of the given protein node.
def peptide_nodes(protein_node)
  namespace = 'protxml:http://regis-web.systemsbiology.net/protXML'
  protein_node.find('protxml:peptide', namespace)
end
|
99
|
+
|
100
|
+
|
101
|
+
# Looks up a record by identifier in an indexed FASTA database.
#
# protein_name - identifier passed to fastadb.get_by_id.
# fastadb      - object responding to #get_by_id.
#
# Returns the entry found.
# Raises KeyError (now carrying the failing identifier) when the lookup returns nil.
def get_fasta_record(protein_name,fastadb)
  entry = fastadb.get_by_id protein_name
  if entry.nil?
    # Keep the console message: callers in this script rescue KeyError silently.
    puts "Failed lookup for #{protein_name}"
    raise KeyError, "Failed lookup for #{protein_name}"
  end
  entry
end
|
110
|
+
|
111
|
+
# Plain value holder describing where a database entry sits on the genome.
# All fields are filled in by cds_info_from_fasta.
class CDSInfo
  attr_accessor :fasta_id, :strand, :frame, :name, :scaffold
  attr_accessor :start, :end, :coding_sequences, :is_sixframe, :gene_id

  # Tells whether candidate_entry shares genomic territory with this entry.
  #
  # Two entries overlap only when they are on the same scaffold and strand and their
  # start..end intervals intersect. Intervals that merely touch at an endpoint are
  # treated as non-overlapping, matching the original `>=` boundary test.
  #
  # Fixes two defects: the strand test used to compare this entry's strand with
  # itself, and the second interval test was inverted, so genuinely overlapping
  # entries were reported as disjoint.
  #
  # Returns true or false.
  def overlap(candidate_entry)
    return false if candidate_entry.scaffold != scaffold
    return false if candidate_entry.strand != strand

    candidate_entry.start < self.end && candidate_entry.end > start
  end
end
|
132
|
+
|
133
|
+
# Builds a CDSInfo from a FASTA entry whose header encodes genomic coordinates.
#
# The entry description is read as space-separated "start|end" pairs: the first pair
# is the overall start/end, any further pairs are the coding sequences. The entry id
# must contain "lcl|<name>" and a "scaffold<N>_" / "scaffold_<N>_" component.
# Ids containing "frame" are treated as six-frame translations (frames above 3 are on
# the '-' strand); all others are treated as predicted transcripts, whose strand is
# '-' when the name contains "rev" and whose gene id is taken from the name.
#
# fasta_entry - object responding to #entry_id and #identifiers.description.
#
# Returns a populated CDSInfo. Note that fasta_id holds the entry object itself.
# Raises EncodingError when the header cannot be parsed. Ids that do not match the
# expected patterns used to fail with NoMethodError on nil, which the script's
# `rescue KeyError,EncodingError` did not catch.
def cds_info_from_fasta(fasta_entry)
  info = CDSInfo.new
  info.fasta_id = fasta_entry

  positions = fasta_entry.identifiers.description.split(' ').map { |coords| coords.split('|').map(&:to_i) }
  raise EncodingError, "No coordinates in header of #{fasta_entry.entry_id}" if positions.empty?

  info.coding_sequences = positions[1..-1] # empty when only the overall range is given
  info.start = positions[0][0]
  info.end = positions[0][1]

  entry_id = fasta_entry.entry_id
  info.scaffold = entry_id[/(scaffold_?\d+)_/, 1]
  info.name = entry_id[/lcl\|(.*)/, 1]
  if info.scaffold.nil? || info.name.nil?
    raise EncodingError, "Unrecognised entry id #{entry_id}"
  end

  if entry_id =~ /frame/
    info.frame = info.name[/frame_(\d)/, 1]
    raise EncodingError, "No frame number in #{entry_id}" if info.frame.nil?
    info.strand = (info.frame.to_i > 3) ? '-' : '+'
    info.is_sixframe = true
  else
    info.strand = (info.name =~ /rev/) ? '-' : '+'
    info.gene_id = info.name[/_\w{3}_(.*)\.t/, 1]
    raise EncodingError, "No gene id in #{entry_id}" if info.gene_id.nil?
    info.is_sixframe = false
  end
  info
end
|
162
|
+
|
163
|
+
|
164
|
+
# Decides whether candidate_entry maps to a genome location not yet covered.
#
# candidate_entry  - entry responding to #gene_id (and accepted by #overlap).
# existing_entries - entries already emitted; each responds to #gene_id and #overlap.
#
# An existing entry makes the candidate redundant when it has the same gene id or
# when existing.overlap(candidate_entry) is true. A nil gene id (six-frame entries
# never get one) is not treated as a match: previously nil == nil made every
# six-frame entry after the first look redundant, which the commented-out
# predecessor of this check avoided by compacting the gene ids.
#
# Returns true when no existing entry covers the candidate, false otherwise.
def is_new_genome_location(candidate_entry,existing_entries)
  candidate_gene = candidate_entry.gene_id
  existing_entries.none? do |existing|
    same_gene = !candidate_gene.nil? && existing.gene_id == candidate_gene
    same_gene || existing.overlap(candidate_entry)
  end
end
|
181
|
+
|
182
|
+
|
183
|
+
# Builds the GFF3 record describing a whole protein.
#
# protein_name - unused; kept so existing callers keep working.
# entry_info   - CDSInfo-like object supplying scaffold, start, end, strand and name.
# prot_prob    - value written to the score column.
# prot_id      - value of the ID attribute.
#
# Returns a Bio::GFF::GFF3::Record of type "protein" from source "MSMS".
def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
  prot_attributes = [["ID",prot_id],["Name",entry_info.name]]
  # Positional arguments: seqid, source, feature type, start, end, score, strand, frame, attributes
  Bio::GFF::GFF3::Record.new(entry_info.scaffold,"MSMS","protein",
                             entry_info.start,entry_info.end,prot_prob,
                             entry_info.strand,nil,prot_attributes)
end
|
190
|
+
|
191
|
+
# Fetches the genomic DNA underlying a database entry.
#
# protein_info - supplies scaffold, start, end and strand.
# genomedb     - FASTA database holding the scaffold sequences.
#
# Returns the slice of the scaffold from index start-1 through index end; for entries
# on the '-' strand the slice is wrapped in Bio::Sequence::NA and reverse-complemented.
# NOTE(review): with 1-based inclusive coordinates this slice carries one base past
# `end` — confirm whether that is intended.
def get_dna_sequence(protein_info,genomedb)
  scaffold_record = get_fasta_record(protein_info.scaffold,genomedb)
  first_index = protein_info.start-1
  region = scaffold_record.naseq.to_s[first_index..protein_info.end]
  return region unless protein_info.strand == "-"

  Bio::Sequence::NA.new(region).reverse_complement
end
|
202
|
+
|
203
|
+
# Tells whether pep_seq occurs in any of the six reading-frame translations of gene_seq.
#
# pep_seq  - peptide sequence to look for.
# gene_seq - nucleotide sequence (anything Bio::Sequence::NA.new accepts).
#
# Returns true as soon as one frame (1..6) contains the peptide, false otherwise.
def peptide_is_in_sixframe(pep_seq,gene_seq)
  gs = Bio::Sequence::NA.new(gene_seq)
  (1..6).any? { |frame| gs.translate(frame).index(pep_seq) }
end
|
212
|
+
|
213
|
+
# gene_seq should already have been reverse_complemented if on reverse strand
|
214
|
+
# Maps a peptide to genomic coordinates by gapped alignment against the gene sequence.
#
# prot_seq     - unused here; kept for signature parity with the other mappers.
# pep_seq      - peptide sequence.
# protein_info - supplies fasta_id, strand, start and end.
# gene_seq     - gene DNA, already reverse-complemented for '-' strand entries.
#
# Returns nil when the peptide is found in a six-frame translation of gene_seq;
# otherwise a one-element Array holding the flat list
# [frag1_start, frag1_end, frag2_start, frag2_end, ...] in genomic coordinates.
def get_peptide_coordinates_by_alignment(prot_seq,pep_seq,protein_info,gene_seq)
  return nil if peptide_is_in_sixframe(pep_seq,gene_seq)

  puts "Warning. Actually found a gap #{protein_info.fasta_id}"
  aln = GappedAligner.new().align(pep_seq,gene_seq)
  # A leftover `require 'debugger';debugger` breakpoint used to sit here; it would
  # abort with LoadError wherever that gem is not installed. Only the warning remains.
  puts "More than one intron.#{aln}" unless aln.gaps.length==1

  frags = aln.fragments
  frags.reverse! if protein_info.strand=='-'

  pep_coords = []
  frags.each do |frag|
    if protein_info.strand=='+'
      frag_genomic_start = protein_info.start + frag[0]
      frag_genomic_end = protein_info.start + frag[1]
    else
      # Alignment offsets count from the gene end on the reverse strand.
      frag_genomic_start = protein_info.end - frag[1]
      frag_genomic_end = protein_info.end - frag[0]
    end
    pep_coords << frag_genomic_start
    pep_coords << frag_genomic_end
  end

  [pep_coords]
end
|
243
|
+
|
244
|
+
# Converts a peptide's position within a spliced protein into genomic fragments.
#
# pepstart         - 0-based index of the peptide's first residue in the protein.
# pepend           - index one past the peptide's last residue.
# gene_start       - genomic coordinate of the first coding base.
# gene_end         - unused; kept for interface compatibility.
# coding_sequences - Array of [start, end] exon coordinates (any order).
#
# The walk assumes the positive strand: peptide offsets are turned into genomic
# positions as if there were no introns, then shifted right by the length of every
# intron that precedes them.
#
# Returns a one-element Array containing the list of [start, end] fragments.
# Raises IndexError when the peptide runs past the last coding sequence. That case
# used to hit a leftover debugger breakpoint and then fail on a nil exon.
def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
  sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }

  # p_i is the base just before the position being searched for: first the peptide
  # start, then (once the start exon is found) the peptide's last base.
  p_i = pepstart*3+gene_start-1
  pi_end = pepend*3+gene_start-1

  fragments = []
  finding_start = true

  sorted_cds.each_with_index do |(cds_start, cds_end), i|
    next_coords = sorted_cds[i+1]

    if cds_end < p_i # Exon lies wholly before the position we are looking for
      raise IndexError, "Peptide extends beyond the last coding sequence" if next_coords.nil?

      intron_offset = next_coords[0]-cds_end-1
      p_i += intron_offset
      pi_end += intron_offset
      fragments << [cds_start,cds_end] unless finding_start # a middle exon
    elsif finding_start
      if pi_end <= cds_end
        # Peptide starts and ends inside this exon: a single fragment.
        fragments << [p_i+1,pi_end]
        break
      end
      raise IndexError, "Peptide extends beyond the last coding sequence" if next_coords.nil?

      fragments << [p_i+1,cds_end]
      pi_end += next_coords[0]-cds_end-1
      p_i = pi_end # from here on we search for the peptide's last base
      finding_start = false
    else # A terminal exon
      fragments << [cds_start,p_i]
      break
    end
  end

  [fragments]
end
|
290
|
+
|
291
|
+
# gene_seq should already have been reverse_complemented if on reverse strand
|
292
|
+
# Maps a peptide of a predicted transcript to genomic fragments using the
# transcript's coding-sequence coordinates.
#
# prot_seq     - full protein sequence.
# pep_seq      - peptide sequence.
# protein_info - supplies strand, start, end and coding_sequences.
# gene_seq     - gene DNA, already reverse-complemented for '-' strand entries.
#
# Returns nil when the peptide is found in a six-frame translation of gene_seq or
# cannot be found in prot_seq (a warning is printed in the latter case); otherwise
# the result of fragment_coords_from_protein_coords.
def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
  return nil if peptide_is_in_sixframe(pep_seq,gene_seq)

  reverse_strand = protein_info.strand=='-'
  # On the reverse strand both sequences are reversed so that indexes grow with
  # genomic position.
  pep_start_i = reverse_strand ? prot_seq.reverse.index(pep_seq.reverse) : prot_seq.index(pep_seq)

  if pep_start_i.nil?
    puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
    return nil
  end

  # Plus 1 because on the reverse strand the stop codon will be at the beginning of
  # the sequence (when read forwards). Need to eliminate it.
  pep_start_i += 1 if reverse_strand
  pep_end_i = pep_start_i+pep_seq.length

  fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
end
|
320
|
+
|
321
|
+
# Maps a peptide to genomic coordinates for a six-frame translation entry.
#
# prot_seq     - translated ORF sequence.
# pep_seq      - peptide sequence.
# protein_info - supplies strand and start.
#
# For '-' strand entries both sequences are reversed before searching. Every
# non-overlapping occurrence of the peptide is reported.
#
# Fixes: a peptide occurring several times used to be reported at the position of its
# first occurrence each time, because the search restarted at the previous match
# index. The peptide is also matched literally instead of being interpolated into a
# regular expression.
#
# Returns an Array with one [[genomic_start, genomic_end]] element per occurrence
# (empty when the peptide does not occur or is empty).
def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
  if protein_info.strand == '-'
    prot_seq = prot_seq.reverse
    pep_seq = pep_seq.reverse
  end
  return [] if pep_seq.empty? # an empty needle would match everywhere

  start_indexes = []
  search_from = 0
  while (found = prot_seq.index(pep_seq, search_from))
    start_indexes << found
    search_from = found + pep_seq.length
  end

  start_indexes.map do |si|
    pep_genomic_start = protein_info.start + 3*si
    pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
    [[pep_genomic_start,pep_genomic_end]]
  end
end
|
342
|
+
|
343
|
+
# Maps a peptide onto the genome, choosing the strategy from the entry type:
# six-frame entries use get_peptide_coordinates_sixframe, all others use
# get_peptide_coordinates_from_transcript_info.
#
# Returns whatever the chosen mapper returns: a list of fragment lists, or nil.
def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
  return get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info) if protein_info.is_sixframe

  get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
end
|
351
|
+
|
352
|
+
|
353
|
+
# Builds one GFF3 record per genomic fragment of a mapped peptide.
#
# coords       - Array of [start, end] genomic fragments.
# protein_info - supplies scaffold and strand.
# pep_id       - ID of the parent peptide record.
# peptide_seq  - expected translation of the concatenated fragments.
# genomedb     - FASTA database holding the scaffold sequences.
# name         - feature type; "CDS" gets IDs ending in ".fg", anything else ".sp".
#
# Each record's Name attribute is the fragment's own translation ("*" when it cannot
# be translated) and its frame column carries the phase accumulated from the
# preceding fragments in transcription order. After building the records the
# concatenated fragments are translated and compared with peptide_seq; a mismatch
# only prints a warning (a leftover debugger breakpoint used to fire first).
#
# Returns the Array of Bio::GFF::GFF3::Record objects.
def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
  scaff = get_fasta_record(protein_info.scaffold,genomedb)
  scaffold_seq = Bio::Sequence::NA.new(scaff.seq)

  # Slices are re-wrapped because String#[] returns a plain String (not the
  # subclass) from Ruby 3.0 on, which would lose #translate / #reverse_complement.
  slice_fragment = lambda do |frag_start,frag_end|
    Bio::Sequence::NA.new(scaffold_seq[frag_start-1..frag_end-1])
  end

  fragment_phase = 0
  ordered_coords = protein_info.strand=='+' ? coords : coords.reverse
  frag_id = (name=="CDS") ? "#{pep_id}.fg" : "#{pep_id}.sp"

  gff_lines = ordered_coords.map do |frag_start,frag_end|
    frag_naseq = slice_fragment.call(frag_start,frag_end)

    frag_seq =
      begin
        coding = protein_info.strand=='-' ? frag_naseq.reverse_complement : frag_naseq
        coding.translate(fragment_phase+1)
      rescue
        puts "Unable to translate #{frag_naseq}" if frag_naseq.length > 1
        "*"
      end

    # Positional arguments: seqid, source, feature type, start, end, score, strand, frame, attributes
    fragment_record = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",name,
                                                 frag_start,frag_end,'',protein_info.strand,fragment_phase,
                                                 [["Parent",pep_id],["ID",frag_id],["Name",frag_seq]])

    # Bases left over after whole codons decide the phase of the next fragment.
    remainder = (frag_naseq.length-fragment_phase) % 3
    fragment_phase = (3-remainder) % 3

    fragment_record
  end

  concat_seq = Bio::Sequence::NA.new(coords.map { |frag_start,frag_end| scaffold_seq[frag_start-1..frag_end-1] }.join)
  check_seq = protein_info.strand=='-' ? concat_seq.reverse_complement.translate : concat_seq.translate
  puts "Fragment seqs not equal to peptide seqs" if check_seq != peptide_seq

  gff_lines
end
|
412
|
+
|
413
|
+
# Reports a start-codon position when a peptide looks like a protein N-terminus.
#
# A start codon is reported when the peptide begins with 'M' and either sits at the
# very start of the protein or is not preceded by K or R (i.e. its N-terminus was not
# produced by a tryptic cleavage).
#
# peptide_genomic_start, peptide_genomic_end - genomic extent of the peptide.
# peptide_seq, protein_seq                   - peptide and parent protein sequences.
# strand                                     - '+' or anything else for reverse.
#
# Returns [codon_start, codon_start+2], or nil when no start codon is implied or the
# peptide does not occur in protein_seq (previously a TypeError).
# NOTE(review): on the reverse strand the codon starts at peptide_genomic_end-1, so
# it extends one base past peptide_genomic_end — confirm the coordinate convention.
def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  pep_index = protein_seq.index(peptide_seq)
  return nil if pep_index.nil?
  return nil unless protein_seq[pep_index]=='M'
  return nil if pep_index > 0 && ['K','R'].include?(protein_seq[pep_index-1])

  start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-1
  [start_codon_coord,start_codon_coord+2]
end
|
431
|
+
|
432
|
+
# Reports the position of the last codon of a peptide whose C-terminus is not tryptic
# (i.e. the peptide does not end in K or R).
#
# peptide_genomic_start, peptide_genomic_end - genomic extent of the peptide.
# peptide_seq                                - peptide sequence.
# protein_seq                                - unused; kept for interface parity.
# strand                                     - '+' or anything else for reverse.
#
# Returns [codon_start, codon_start+2], or nil for peptides ending in K or R.
# NOTE(review): the codon is taken at end-3 (forward) / start+1 (reverse), which is
# offset by one base from the peptide's outermost codon if the coordinates are
# 1-based inclusive — confirm the coordinate convention.
def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  return nil if peptide_seq.end_with?('K','R')

  codon_coord = (strand=='+') ? peptide_genomic_end-3 : peptide_genomic_start+1
  [codon_coord,codon_coord+2]
end
|
443
|
+
|
444
|
+
|
445
|
+
# Extracts the putative signal peptide lying in front of a peptide.
#
# A signal peptide is only reported when the peptide is not at the protein start, is
# not preceded by K or R, and does not itself begin with 'M'. It then runs from the
# nearest 'M' upstream of the peptide up to the residue just before the peptide.
#
# peptide_seq - peptide sequence.
# protein_seq - parent protein sequence.
#
# Returns the signal peptide String, or nil when the conditions above are not met,
# when no upstream 'M' exists (a message is printed), or when the peptide does not
# occur in protein_seq (previously a NoMethodError).
def get_signal_peptide_for_peptide(peptide_seq,protein_seq)
  pep_index = protein_seq.index(peptide_seq)
  return nil if pep_index.nil? || pep_index.zero?
  return nil if ['K','R'].include?(protein_seq[pep_index-1]) || protein_seq[pep_index]=='M'

  # Nearest methionine upstream of the peptide.
  met_index = protein_seq.rindex('M', pep_index-1)
  if met_index.nil?
    puts "No methionine found ahead of peptide sequence. Unable to determine signal peptide sequence"
    return nil
  end

  protein_seq[met_index..(pep_index-1)]
end
|
463
|
+
|
464
|
+
# Produces every GFF3 record that describes one peptide mapped onto one protein.
#
# protein_seq   - parent protein sequence.
# peptide_seq   - peptide sequence.
# protein_info  - CDSInfo of the parent protein.
# prot_id       - ID of the parent protein record (used as the peptide's Parent).
# peptide_prob  - value written to the peptide's score column.
# peptide_count - running number used to build the peptide ID "<prot_id>.p<count>".
# genomedb      - FASTA database of scaffold sequences; required for entries that
#                 are not six-frame translations.
#
# For each genomic mapping of the peptide the result contains the peptide record, its
# CDS fragment records and, where applicable, a start_codon record, a cterm record
# and signalpeptide fragment records.
#
# Fixes: the cterm branch used to append the start-codon record (or nil) instead of
# the cterm record, and both start_codon and cterm records were given a flat
# ["Parent", id] attribute list instead of the [[tag, value]] pairs used by every
# other record in this file.
#
# Returns an Array of records; empty when the peptide has no mapping of its own
# (a predicted-transcript peptide that six-frame entries should already cover).
# Raises ArgumentError when genomedb is missing for a predicted transcript.
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,genomedb=nil)
  dna_sequence = nil
  unless protein_info.is_sixframe
    raise ArgumentError, "A genome is required if predicted transcripts are to be mapped" if genomedb.nil?
    dna_sequence = get_dna_sequence(protein_info,genomedb)
  end

  peptide_coords = get_peptide_coordinates(protein_seq,peptide_seq,protein_info,dna_sequence)
  # nil means the entry is a predicted transcript that should already be covered by 6-frame
  return [] if peptide_coords.nil?

  gff_records = []

  # Convert each mapping of the peptide into GFF lines.
  peptide_coords.each do |coords|
    pep_genomic_start = coords.first[0]
    pep_genomic_end = coords.last[1]

    pep_id = "#{prot_id}.p#{peptide_count.to_s}"
    pep_attributes = [["ID",pep_id],["Parent",prot_id],["Name",peptide_seq]]

    # Positional arguments: seqid, source, feature type, start, end, score, strand, frame, attributes
    pep_gff_line = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS","peptide",
                                              pep_genomic_start,pep_genomic_end,peptide_prob,
                                              protein_info.strand,nil,pep_attributes)

    # For standard peptides
    frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
    gff_records += [pep_gff_line] + frag_gffs

    # For peptides with only 1 tryptic terminus
    start_codon_coords = get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if start_codon_coords
      start_codon_gff = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS","start_codon",
                                                   start_codon_coords[0],start_codon_coords[1],'',
                                                   protein_info.strand,nil,[["Parent",pep_id]])
      gff_records << start_codon_gff
    end

    cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if cterm_coords
      cterm_gff = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS","cterm",
                                             cterm_coords[0],cterm_coords[1],'',
                                             protein_info.strand,nil,[["Parent",pep_id]])
      gff_records << cterm_gff
    end

    signal_peptide = get_signal_peptide_for_peptide(peptide_seq,protein_seq)
    next unless signal_peptide

    signal_peptide_coords = get_peptide_coordinates(protein_seq,signal_peptide,protein_info,dna_sequence)
    next unless signal_peptide_coords

    signal_peptide_coords.each do |spcoords|
      gff_records += generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,signal_peptide,genomedb,"signalpeptide")
    end
  end

  # NOTE(review): echoes every record to stdout; looks like debugging output but is
  # kept so console output is unchanged.
  puts gff_records

  gff_records
end
|
539
|
+
|
540
|
+
# Main driver: read the protXML, map every sufficiently probable peptide of every
# sufficiently probable protein onto the genome, and write the collected GFF records.
proteins = parse_proteins(tool.protxml)
fastadb = prepare_fasta(tool.database,'prot')
genomedb = nil
genomedb = prepare_fasta(tool.genome,'nucl') if tool.genome

puts "Aligning peptides and writing GFF data..."

# NOTE(review): low_prob is reported below but nothing ever increments it.
low_prob = 0
skipped = 0
peptide_count = 0
protein_count = 0
total_peptides = 0

proteins.each do |prot|
  prot_prob = prot['probability']
  # The peptide threshold doubles as the protein-level cut-off.
  next if prot_prob.to_f < tool.peptide_probability_threshold

  # Gets identifiers of all proteins (including indistinguishable ones)
  prot_names = protein_names(prot)
  peptides = peptide_nodes(prot)
  entries_covered = []

  prot_names.each do |protein_name|
    protein_count += 1
    prot_id = "pr#{protein_count.to_s}"
    begin
      protein_fasta_entry = get_fasta_record(protein_name,fastadb)
      protein_info = cds_info_from_fasta(protein_fasta_entry)

      if is_new_genome_location(protein_info,entries_covered)
        # The protein record must carry the same ID ("pr<N>") that its peptides name
        # as Parent; the bare counter was passed here before, breaking that link.
        protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,prot_id)

        # NOTE(review): the version pragma is repeated for every protein and the
        # sequence-region end is hard-coded to 160 — confirm this is intended.
        gff_db.records += ["##gff-version 3\n","##sequence-region #{protein_info.scaffold} 1 160\n",protein_gff]

        prot_seq = protein_fasta_entry.aaseq.to_s
        raise ArgumentError, "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s

        peptide_count = 1
        peptides.each do |peptide|
          pprob = peptide['nsp_adjusted_probability'].to_f
          next unless pprob >= tool.peptide_probability_threshold

          total_peptides += 1
          pep_seq = peptide['peptide_sequence']

          gff_db.records += generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,genomedb)
          peptide_count += 1
        end
      else
        puts "Skipping redundant entry #{protein_name}"
        protein_count -= 1 # To counter +1 prior to begin rescue end block
      end

      entries_covered << protein_info
    rescue KeyError,EncodingError
      # Entry missing from the database or with an unparseable header.
      # This used to add 0, so the reported count was always zero.
      skipped += 1
    end
  end
end

# Block form guarantees the handle is flushed and closed even if a write fails.
File.open(gff_out_file,'w+') do |f|
  gff_db.records.each { |rec| f.write(rec.to_s) }
end

p "Finished."
p "Proteins: #{protein_count}"
p "Skipped Decoys: #{skipped}"
p "Total Peptides: #{total_peptides}"
p "Peptides Written: #{total_peptides - low_prob}"
p "Peptides Culled: #{low_prob}"
exit(0)
|