protk 1.2.6.pre5 → 1.3.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/protxml_to_gff.rb
CHANGED
@@ -7,73 +7,26 @@
|
|
7
7
|
#
|
8
8
|
|
9
9
|
require 'protk/constants'
|
10
|
-
require 'protk/
|
10
|
+
require 'protk/protxml_to_gff_tool'
|
11
11
|
require 'protk/fastadb'
|
12
|
-
require 'protk/gapped_aligner'
|
13
12
|
require 'libxml'
|
14
13
|
require 'bio'
|
15
14
|
|
16
15
|
include LibXML
|
17
16
|
|
18
|
-
tool=
|
19
|
-
tool.option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "
|
17
|
+
tool=ProtXMLToGFFTool.new()
|
20
18
|
|
19
|
+
@output_extension=".gff"
|
20
|
+
@output_suffix=""
|
21
21
|
|
22
|
-
tool.
|
23
|
-
tool.option_parser.on( '-p filename','--protxml filename', 'Observed Data (ProtXML Format)' ) do |file|
|
24
|
-
tool.options.protxml=file
|
25
|
-
end
|
26
|
-
|
27
|
-
tool.options.database=nil
|
28
|
-
tool.option_parser.on( '-d filename','--database filename', 'Database used for ms/ms searches (Fasta Format)' ) do |file|
|
29
|
-
tool.options.database=file
|
30
|
-
end
|
22
|
+
exit unless tool.check_options(true,[:database])
|
31
23
|
|
32
|
-
|
33
|
-
tool.option_parser.on( '-f term','--find term', 'Restrict output to proteins whose name matches the specified string' ) do |term|
|
34
|
-
tool.options.protein_find=term
|
35
|
-
end
|
36
|
-
|
37
|
-
tool.options.nterm_minlen=7
|
38
|
-
tool.option_parser.on( '-n len','--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len' ) do |len|
|
39
|
-
tool.options.nterm_minlen=len
|
40
|
-
end
|
24
|
+
input_proxml=ARGV[0]
|
41
25
|
|
42
|
-
tool.
|
43
|
-
tool.
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
tool.options.skip_fasta_indexing=false
|
48
|
-
tool.option_parser.on('--skip-index','Don\'t index database (Index should already exist)') do
|
49
|
-
tool.options.skip_fasta_indexing=true
|
50
|
-
end
|
51
|
-
|
52
|
-
tool.options.stack_charge_states=false
|
53
|
-
tool.option_parser.on('--stack-charge-states','Different peptide charge states get separate gff entries') do
|
54
|
-
tool.options.stack_charge_states=true
|
55
|
-
end
|
56
|
-
|
57
|
-
tool.options.collapse_redundant_proteins=false
|
58
|
-
tool.option_parser.on('--collapse-redundant-proteins','Proteins that cover genomic regions already covered will be skipped') do
|
59
|
-
tool.options.collapse_redundant_proteins=true
|
60
|
-
end
|
61
|
-
|
62
|
-
tool.options.peptide_probability_threshold=0.95
|
63
|
-
tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
|
64
|
-
tool.options.peptide_probability_threshold=thresh.to_f
|
65
|
-
end
|
66
|
-
|
67
|
-
tool.options.protein_probability_threshold=0.99
|
68
|
-
tool.option_parser.on('--prot-threshold prob','Protein Probability Threshold (Default 0.99)') do |thresh|
|
69
|
-
tool.options.protein_probability_threshold=thresh.to_f
|
70
|
-
end
|
71
|
-
|
72
|
-
exit unless tool.check_options [:protxml,:database]
|
73
|
-
|
74
|
-
gff_out_file="peptides.gff"
|
75
|
-
if ( tool.explicit_output != nil)
|
76
|
-
gff_out_file=tool.explicit_output
|
26
|
+
if ( tool.explicit_output!=nil)
|
27
|
+
gff_out_file=tool.explicit_output
|
28
|
+
else
|
29
|
+
gff_out_file=Tool.default_output_path(input_proxml,@output_extension,tool.output_prefix,@output_suffix)
|
77
30
|
end
|
78
31
|
|
79
32
|
gff_db = Bio::GFF.new()
|
@@ -92,7 +45,7 @@ def prepare_fasta(database_path,type)
|
|
92
45
|
db_filename = nil
|
93
46
|
case
|
94
47
|
when Pathname.new(database_path).exist? # It's an explicitly named db
|
95
|
-
db_filename = Pathname.new(database_path).
|
48
|
+
db_filename = Pathname.new(database_path).expand_path.to_s
|
96
49
|
else
|
97
50
|
db_filename=Constants.new.current_database_for_name(database_path)
|
98
51
|
end
|
@@ -109,457 +62,7 @@ def prepare_fasta(database_path,type)
|
|
109
62
|
orf_lookup
|
110
63
|
end
|
111
64
|
|
112
|
-
|
113
|
-
indis_proteins = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
|
114
|
-
prot_names = [protein_node['protein_name']]
|
115
|
-
for protein in indis_proteins
|
116
|
-
prot_names += [protein['protein_name']]
|
117
|
-
end
|
118
|
-
prot_names
|
119
|
-
end
|
120
|
-
|
121
|
-
def peptide_nodes(protein_node)
|
122
|
-
return protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
|
123
|
-
end
|
124
|
-
|
125
|
-
|
126
|
-
def get_fasta_record(protein_name,fastadb)
|
127
|
-
# puts "Looking up #{protein_name}"
|
128
|
-
entry = fastadb.get_by_id protein_name
|
129
|
-
if ( entry == nil)
|
130
|
-
puts "Failed lookup for #{protein_name}"
|
131
|
-
raise KeyError
|
132
|
-
end
|
133
|
-
entry
|
134
|
-
end
|
135
|
-
|
136
|
-
class CDSInfo
|
137
|
-
attr_accessor :fasta_id
|
138
|
-
attr_accessor :strand
|
139
|
-
attr_accessor :frame
|
140
|
-
attr_accessor :name
|
141
|
-
attr_accessor :scaffold
|
142
|
-
attr_accessor :start
|
143
|
-
attr_accessor :end
|
144
|
-
attr_accessor :coding_sequences
|
145
|
-
attr_accessor :is_sixframe
|
146
|
-
attr_accessor :gene_id
|
147
|
-
|
148
|
-
def overlap(candidate_entry)
|
149
|
-
return false if candidate_entry.scaffold!=self.scaffold
|
150
|
-
return false if strand!=self.strand
|
151
|
-
return false if candidate_entry.start >= self.end
|
152
|
-
return false if self.start <= candidate_entry.end
|
153
|
-
return true
|
154
|
-
end
|
155
|
-
|
156
|
-
end
|
157
|
-
|
158
|
-
def cds_info_from_fasta(fasta_entry)
|
159
|
-
info=CDSInfo.new
|
160
|
-
info.fasta_id=fasta_entry
|
161
|
-
positions = fasta_entry.identifiers.description.split(' ').collect { |coords| coords.split('|').collect {|pos| pos.to_i} }
|
162
|
-
info.coding_sequences=[]
|
163
|
-
info.gene_id
|
164
|
-
if ( positions.length < 1 )
|
165
|
-
raise EncodingError
|
166
|
-
elsif ( positions.length > 1)
|
167
|
-
info.coding_sequences = positions[1..-1]
|
168
|
-
end
|
169
|
-
|
170
|
-
info.start = positions[0][0]
|
171
|
-
info.end = positions[0][1]
|
172
|
-
|
173
|
-
info.scaffold=fasta_entry.entry_id.scan(/(scaffold_?\d+)_/)[0][0]
|
174
|
-
info.name = fasta_entry.entry_id.scan(/lcl\|(.*)/)[0][0]
|
175
|
-
|
176
|
-
if fasta_entry.entry_id =~ /frame/
|
177
|
-
info.frame=info.name.scan(/frame_(\d)/)[0][0]
|
178
|
-
info.strand = (info.frame.to_i > 3) ? '-' : '+'
|
179
|
-
info.is_sixframe = true
|
180
|
-
else
|
181
|
-
info.strand = (info.name =~ /rev/) ? '-' : '+'
|
182
|
-
info.gene_id=info.name.scan(/_\w{3}_(.*)\.t/)[0][0]
|
183
|
-
info.is_sixframe = false
|
184
|
-
end
|
185
|
-
info
|
186
|
-
end
|
187
|
-
|
188
|
-
|
189
|
-
def is_new_genome_location(candidate_entry,existing_entries)
|
190
|
-
# puts existing_entries
|
191
|
-
# require 'debugger';debugger
|
192
|
-
|
193
|
-
# genes=existing_entries.collect { |e| e.gene_id }.compact
|
194
|
-
|
195
|
-
# if genes.include?(candidate_entry.gene_id)
|
196
|
-
# return false
|
197
|
-
# end
|
198
|
-
|
199
|
-
existing_entries.each do |existing|
|
200
|
-
return false if existing.gene_id==candidate_entry.gene_id
|
201
|
-
return false if existing.overlap(candidate_entry)
|
202
|
-
end
|
203
|
-
|
204
|
-
return true
|
205
|
-
end
|
206
|
-
|
207
|
-
|
208
|
-
def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
|
209
|
-
prot_qualifiers = {"source" => "MSMS", "score" => prot_prob, "ID" => prot_id}
|
210
|
-
prot_attributes = [["ID",prot_id],["Name",entry_info.name]]
|
211
|
-
prot_gff_line = Bio::GFF::GFF3::Record.new(seqid = entry_info.scaffold,source="MSMS",feature_type="protein",
|
212
|
-
start_position=entry_info.start,end_position=entry_info.end,score=prot_prob,strand=entry_info.strand,frame=nil,attributes=prot_attributes)
|
213
|
-
prot_gff_line
|
214
|
-
end
|
215
|
-
|
216
|
-
def get_dna_sequence(protein_info,genomedb)
|
217
|
-
|
218
|
-
scaffold_sequence = get_fasta_record(protein_info.scaffold,genomedb)
|
219
|
-
gene_sequence = scaffold_sequence.naseq.to_s[(protein_info.start-1)..protein_info.end]
|
220
|
-
|
221
|
-
if ( protein_info.strand == "-")
|
222
|
-
gene_sequence = Bio::Sequence::NA.new(gene_sequence).reverse_complement
|
223
|
-
end
|
224
|
-
|
225
|
-
gene_sequence
|
226
|
-
end
|
227
|
-
|
228
|
-
def peptide_is_in_sixframe(pep_seq,gene_seq)
|
229
|
-
gs=Bio::Sequence::NA.new(gene_seq)
|
230
|
-
(1..6).each do |frame|
|
231
|
-
if gs.translate(frame).index(pep_seq)
|
232
|
-
return true
|
233
|
-
end
|
234
|
-
end
|
235
|
-
return false
|
236
|
-
end
|
237
|
-
|
238
|
-
def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
|
239
|
-
|
240
|
-
sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }
|
241
|
-
|
242
|
-
|
243
|
-
# Assume positive strand
|
244
|
-
pi_start=pepstart*3+gene_start-1
|
245
|
-
pi_end=pepend*3+gene_start-1
|
246
|
-
|
247
|
-
fragments=[]
|
248
|
-
p_i = pi_start #Initially we are looking for the first fragment
|
249
|
-
finding_start=true
|
250
|
-
|
251
|
-
sorted_cds.each_with_index do |cds_coords, i|
|
252
|
-
cds_start=cds_coords[0]
|
253
|
-
cds_end = cds_coords[1]
|
254
|
-
if cds_end < p_i # Exon is before index in sequence and doesn't contain p_i
|
255
|
-
if sorted_cds.length <= i+1
|
256
|
-
require 'debugger';debugger
|
257
|
-
end
|
258
|
-
|
259
|
-
next_coords = sorted_cds[i+1]
|
260
|
-
intron_offset = ((next_coords[0]-cds_end)-1)
|
261
|
-
p_i+=intron_offset
|
262
|
-
pi_end+=intron_offset
|
263
|
-
if !finding_start
|
264
|
-
# This is a middle exon
|
265
|
-
fragments << [cds_start,cds_end]
|
266
|
-
end
|
267
|
-
else
|
268
|
-
if finding_start
|
269
|
-
|
270
|
-
if ( pi_end <= cds_end) #Whole peptide contained in a single exon
|
271
|
-
fragments << [p_i+1,pi_end]
|
272
|
-
break;
|
273
|
-
end
|
274
|
-
|
275
|
-
|
276
|
-
fragments << [p_i+1,(cds_end)]
|
277
|
-
next_coords = sorted_cds[i+1]
|
278
|
-
intron_offset = ((next_coords[0]-cds_end)-1)
|
279
|
-
p_i+=intron_offset
|
280
|
-
pi_end+=intron_offset
|
281
|
-
p_i = pi_end
|
282
|
-
finding_start=false
|
283
|
-
else # A terminal exon
|
284
|
-
# require 'debugger';debugger
|
285
|
-
fragments << [(cds_start),(p_i)]
|
286
|
-
break;
|
287
|
-
end
|
288
|
-
end
|
289
|
-
end
|
290
|
-
[fragments]
|
291
|
-
end
|
292
|
-
|
293
|
-
# gene_seq should already have been reverse_complemented if on reverse strand
|
294
|
-
def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
|
295
|
-
# if ( peptide_is_in_sixframe(pep_seq,gene_seq))
|
296
|
-
# Peptide is in 6-frame but on a predicted transcript
|
297
|
-
# return nil
|
298
|
-
# else
|
299
|
-
|
300
|
-
# puts "Found a gap #{protein_info.fasta_id}"
|
301
|
-
if protein_info.strand=='-'
|
302
|
-
pep_index = prot_seq.reverse.index(pep_seq.reverse)
|
303
|
-
if pep_index==nil
|
304
|
-
# require 'debugger';debugger
|
305
|
-
puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
|
306
|
-
return nil
|
307
|
-
end
|
308
|
-
pep_start_i = prot_seq.reverse.index(pep_seq.reverse)+1
|
309
|
-
# Plus 1 because on reverse stand stop-codon will be at the beginning of the sequence (when read forwards). Need to eliminate it.
|
310
|
-
else
|
311
|
-
pep_start_i = prot_seq.index(pep_seq)
|
312
|
-
if pep_start_i==nil
|
313
|
-
# require 'debugger';debugger
|
314
|
-
puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
|
315
|
-
return nil
|
316
|
-
end
|
317
|
-
end
|
318
|
-
pep_end_i = pep_start_i+pep_seq.length
|
319
|
-
|
320
|
-
return fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
|
321
|
-
# end
|
322
|
-
end
|
323
|
-
|
324
|
-
def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
|
325
|
-
|
326
|
-
if ( protein_info.strand == '-' )
|
327
|
-
prot_seq = prot_seq.reverse
|
328
|
-
pep_seq = pep_seq.reverse
|
329
|
-
end
|
330
|
-
|
331
|
-
start_indexes = [0]
|
332
|
-
|
333
|
-
prot_seq.scan /#{pep_seq}/ do |match|
|
334
|
-
start_indexes << prot_seq.index(match,start_indexes.last)
|
335
|
-
end
|
336
|
-
start_indexes.delete_at(0)
|
337
|
-
|
338
|
-
start_indexes.collect do |si|
|
339
|
-
pep_genomic_start = protein_info.start + 3*si
|
340
|
-
pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
|
341
|
-
[[pep_genomic_start,pep_genomic_end]]
|
342
|
-
end
|
343
|
-
|
344
|
-
end
|
345
|
-
|
346
|
-
# Returns a 4-mer [genomic_start,fragment1_end(or0),frag2_start(or0),genomic_end]
|
347
|
-
def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
|
348
|
-
if ( protein_info.is_sixframe)
|
349
|
-
return get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
|
350
|
-
else
|
351
|
-
return get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
|
352
|
-
end
|
353
|
-
end
|
354
|
-
|
355
|
-
|
356
|
-
def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
|
357
|
-
scaff = get_fasta_record(protein_info.scaffold,genomedb)
|
358
|
-
scaffold_seq = Bio::Sequence::NA.new(scaff.seq)
|
359
|
-
|
360
|
-
fragment_phase = 0
|
361
|
-
ordered_coords= protein_info.strand=='+' ? coords : coords.reverse
|
362
|
-
if name=="CDS"
|
363
|
-
frag_id="#{pep_id}.fg"
|
364
|
-
else
|
365
|
-
frag_id="#{pep_id}.sp"
|
366
|
-
end
|
367
|
-
gff_lines = ordered_coords.collect do |frag_start,frag_end|
|
368
|
-
frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
|
369
|
-
|
370
|
-
begin
|
371
|
-
frag_frame = fragment_phase+1
|
372
|
-
frag_seq = nil
|
373
|
-
if ( protein_info.strand=='-')
|
374
|
-
frag_seq = frag_naseq.reverse_complement.translate(frag_frame)
|
375
|
-
else
|
376
|
-
frag_seq = frag_naseq.translate(frag_frame)
|
377
|
-
end
|
378
|
-
rescue
|
379
|
-
if frag_naseq.length > 1
|
380
|
-
puts "Unable to translate #{frag_naseq}"
|
381
|
-
# require 'debugger'
|
382
|
-
end
|
383
|
-
frag_seq="*"
|
384
|
-
end
|
385
|
-
|
386
|
-
fragment_record=Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
|
387
|
-
feature_type=name,start_position=frag_start,end_position=frag_end,score='',
|
388
|
-
strand=protein_info.strand,frame=fragment_phase,attributes=[["Parent",pep_id],["ID",frag_id],["Name",frag_seq]])
|
389
|
-
|
390
|
-
|
391
|
-
remainder=(frag_naseq.length-fragment_phase) % 3
|
392
|
-
fragment_phase=(3-remainder) % 3
|
393
|
-
|
394
|
-
fragment_record
|
395
|
-
end
|
396
|
-
|
397
|
-
|
398
|
-
concat_seq=nil
|
399
|
-
|
400
|
-
coords.each do |frag_start,frag_end|
|
401
|
-
frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
|
402
|
-
concat_seq += frag_naseq unless concat_seq == nil
|
403
|
-
concat_seq = frag_naseq if concat_seq==nil
|
404
|
-
end
|
405
|
-
|
406
|
-
check_seq = protein_info.strand=='-' ? concat_seq.reverse_complement.translate : concat_seq.translate
|
407
|
-
if ( check_seq != peptide_seq)
|
408
|
-
require 'debugger';debugger
|
409
|
-
puts "Fragment seqs not equal to peptide seqs"
|
410
|
-
end
|
411
|
-
|
412
|
-
return gff_lines
|
413
|
-
|
414
|
-
end
|
415
|
-
|
416
|
-
def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
|
417
|
-
pi=protein_seq.index(peptide_seq)
|
418
|
-
if ( protein_seq[pi]=='M' )
|
419
|
-
is_tryptic=false
|
420
|
-
if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R') )
|
421
|
-
is_tryptic=true
|
422
|
-
elsif (pi==0)
|
423
|
-
is_tryptic=true
|
424
|
-
end
|
425
|
-
return nil unless is_tryptic
|
426
|
-
|
427
|
-
start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-2
|
428
|
-
# require 'debugger';debugger
|
429
|
-
return [start_codon_coord,start_codon_coord+2]
|
430
|
-
else
|
431
|
-
return nil
|
432
|
-
end
|
433
|
-
end
|
434
|
-
|
435
|
-
def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
|
436
|
-
|
437
|
-
if ( (peptide_seq[-1]!='K' && peptide_seq[-1]!='R' ) )
|
438
|
-
|
439
|
-
codon_coord = (strand=='+') ? peptide_genomic_end-3 : peptide_genomic_start+1
|
440
|
-
# require 'debugger';debugger
|
441
|
-
return [codon_coord,codon_coord+2]
|
442
|
-
else
|
443
|
-
return nil
|
444
|
-
end
|
445
|
-
end
|
446
|
-
|
447
|
-
|
448
|
-
def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
|
449
|
-
pi=protein_seq.index(peptide_seq)
|
450
|
-
if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R' && protein_seq[pi]!='M') )
|
451
|
-
# Since trypsin sometimes cleaves before P (ie breaking the rule)
|
452
|
-
# we don't check for it and assume those cases are real tryptic termini
|
453
|
-
reverse_leader_seq=protein_seq[0..pi].reverse
|
454
|
-
mi=reverse_leader_seq.index('M')
|
455
|
-
|
456
|
-
if ( mi==nil )
|
457
|
-
puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
|
458
|
-
return nil
|
459
|
-
end
|
460
|
-
|
461
|
-
mi=pi-mi
|
462
|
-
|
463
|
-
ntermseq=protein_seq[mi..(pi-1)]
|
464
|
-
|
465
|
-
# if ( ntermseq.length < minlen )
|
466
|
-
# return nil
|
467
|
-
# end
|
468
|
-
|
469
|
-
# $STDOUT.write protein_seq[mi..(pi+peptide_seq.length-1)]
|
470
|
-
# require 'debugger';debugger
|
471
|
-
full_seq_with_annotations = "#{ntermseq}(cleaved)#{protein_seq[(pi..(pi+peptide_seq.length-1))]}"
|
472
|
-
|
473
|
-
return full_seq_with_annotations
|
474
|
-
else
|
475
|
-
return nil
|
476
|
-
end
|
477
|
-
end
|
478
|
-
|
479
|
-
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)
|
480
|
-
|
481
|
-
prot_seq = protein_seq
|
482
|
-
pep_seq = peptide_seq
|
483
|
-
|
484
|
-
|
485
|
-
peptide_coords = get_peptide_coordinates(prot_seq,pep_seq,protein_info,dna_sequence)
|
486
|
-
|
487
|
-
if ( peptide_coords==nil ) # Return value of nil means the entry is a predicted transcript that should already be covered by 6-frame
|
488
|
-
return []
|
489
|
-
end
|
490
|
-
|
491
|
-
gff_records=[]
|
492
|
-
|
493
|
-
# Now convert peptide coordinate to genome coordinates
|
494
|
-
# And create gff lines for each match
|
495
|
-
peptide_coords.each do |coords|
|
496
|
-
|
497
|
-
# require 'debugger';debugger
|
498
|
-
pep_genomic_start = coords.first[0]
|
499
|
-
pep_genomic_end = coords.last[1]
|
500
|
-
|
501
|
-
pep_id = "#{prot_id}.p#{peptide_count.to_s}"
|
502
|
-
pep_attributes = [["ID",pep_id],["Parent",prot_id],["Name",pep_seq]]
|
503
|
-
|
504
|
-
pep_gff_line = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
|
505
|
-
feature_type="peptide",start_position=pep_genomic_start,end_position=pep_genomic_end,score=peptide_prob,
|
506
|
-
strand=protein_info.strand,frame=nil,attributes=pep_attributes)
|
507
|
-
|
508
|
-
# For standard peptides
|
509
|
-
frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
|
510
|
-
gff_records += [pep_gff_line] + frag_gffs
|
511
|
-
# require 'debugger';debugger
|
512
|
-
# For peptides with only 1 tryptic terminus
|
513
|
-
start_codon_coords=get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
|
514
|
-
if start_codon_coords
|
515
|
-
start_codon_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
|
516
|
-
feature_type="start_codon",start_position=start_codon_coords[0],end_position=start_codon_coords[1],score='',
|
517
|
-
strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
|
518
|
-
gff_records+=[start_codon_gff]
|
519
|
-
end
|
520
|
-
|
521
|
-
cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
|
522
|
-
if ( cterm_coords )
|
523
|
-
cterm_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
|
524
|
-
feature_type="cterm",start_position=cterm_coords[0],end_position=cterm_coords[1],score='',
|
525
|
-
strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
|
526
|
-
gff_records+=[start_codon_gff]
|
527
|
-
end
|
528
|
-
|
529
|
-
end
|
530
|
-
# puts gff_records
|
531
|
-
|
532
|
-
gff_records
|
533
|
-
end
|
534
|
-
|
535
|
-
def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
|
536
|
-
pep_id = "#{prot_id}.p#{peptide_count.to_s}"
|
537
|
-
signal_peptide = get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
|
538
|
-
if signal_peptide
|
539
|
-
$stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"
|
540
|
-
raw_signal_peptide=signal_peptide.sub(/\(cleaved\)/,"")
|
541
|
-
# Get raw signal_peptide sequence
|
542
|
-
|
543
|
-
signal_peptide_coords=get_peptide_coordinates(protein_seq,raw_signal_peptide,protein_info,dna_sequence)
|
544
|
-
if signal_peptide_coords
|
545
|
-
signal_peptide_coords.each do |spcoords|
|
546
|
-
signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,raw_signal_peptide,genomedb,"signalpeptide")
|
547
|
-
gff_records += signal_peptide_gff
|
548
|
-
end
|
549
|
-
end
|
550
|
-
end
|
551
|
-
end
|
552
|
-
|
553
|
-
def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
|
554
|
-
nameindex = peptide_gff.attributes.index {|obj| obj[0]=="Name" }
|
555
|
-
pep_seq = peptide_gff.attributes[nameindex][1]
|
556
|
-
existing = peptides_covered_genome[pep_seq]
|
557
|
-
return true if existing==peptide_gff.start
|
558
|
-
|
559
|
-
return false
|
560
|
-
end
|
561
|
-
|
562
|
-
proteins = parse_proteins(tool.protxml)
|
65
|
+
proteins = parse_proteins(input_proxml)
|
563
66
|
fastadb = prepare_fasta(tool.database,'prot')
|
564
67
|
genomedb = nil
|
565
68
|
if tool.genome
|
@@ -583,7 +86,7 @@ for prot in proteins
|
|
583
86
|
end
|
584
87
|
|
585
88
|
# Gets identifiers of all proteins (includeing indistinguishable ones)
|
586
|
-
prot_names=protein_names(prot)
|
89
|
+
prot_names=tool.protein_names(prot)
|
587
90
|
|
588
91
|
|
589
92
|
if tool.protein_find!=nil
|
@@ -591,19 +94,19 @@ for prot in proteins
|
|
591
94
|
end
|
592
95
|
|
593
96
|
|
594
|
-
peptides=peptide_nodes(prot)
|
97
|
+
peptides=tool.peptide_nodes(prot)
|
595
98
|
entries_covered=[]
|
596
99
|
for protein_name in prot_names
|
597
100
|
protein_count += 1
|
598
101
|
prot_id = "pr#{protein_count.to_s}"
|
599
102
|
begin
|
600
103
|
|
601
|
-
protein_fasta_entry = get_fasta_record(protein_name,fastadb)
|
602
|
-
protein_info = cds_info_from_fasta(protein_fasta_entry)
|
104
|
+
protein_fasta_entry = tool.get_fasta_record(protein_name,fastadb)
|
105
|
+
protein_info = tool.cds_info_from_fasta(protein_fasta_entry)
|
603
106
|
|
604
|
-
unless (tool.collapse_redundant_proteins && !is_new_genome_location(protein_info,entries_covered) )
|
107
|
+
unless (tool.collapse_redundant_proteins && !tool.is_new_genome_location(protein_info,entries_covered) )
|
605
108
|
|
606
|
-
protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
|
109
|
+
protein_gff = tool.generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
|
607
110
|
|
608
111
|
gff_db.records += ["##gff-version 3\n","##sequence-region #{protein_info.scaffold} 1 160\n",protein_gff]
|
609
112
|
|
@@ -624,15 +127,15 @@ for prot in proteins
|
|
624
127
|
dna_sequence=nil
|
625
128
|
if !protein_info.is_sixframe
|
626
129
|
throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
|
627
|
-
dna_sequence = get_dna_sequence(protein_info,genomedb)
|
130
|
+
dna_sequence = tool.get_dna_sequence(protein_info,genomedb)
|
628
131
|
end
|
629
132
|
|
630
133
|
|
631
|
-
peptide_gff = generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
|
134
|
+
peptide_gff = tool.generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
|
632
135
|
|
633
|
-
unless (peptide_gff.length==0 || peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
|
136
|
+
unless (peptide_gff.length==0 || tool.peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
|
634
137
|
|
635
|
-
add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
|
138
|
+
tool.add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
|
636
139
|
|
637
140
|
gff_db.records += peptide_gff
|
638
141
|
|
data/bin/protxml_to_table.rb
CHANGED
@@ -19,23 +19,9 @@ include LibXML
|
|
19
19
|
tool=Tool.new([:explicit_output])
|
20
20
|
tool.option_parser.banner = "Convert a protXML file to a tab delimited table.\n\nUsage: protxml_to_table.rb [options] file1.protXML"
|
21
21
|
|
22
|
-
tool.
|
23
|
-
tool.option_parser.on("--groups","Print output by groups rather than for each protein") do
|
24
|
-
tool.options.groups=true
|
25
|
-
end
|
26
|
-
|
27
|
-
# tool.options.proteinid_regex=".*?\|.*?\|(.*)"
|
28
|
-
# tool.option_parser.on( '--regex rexpr', 'Regex' ) do |regex|
|
29
|
-
# tool.options.proteinid_regex=regex
|
30
|
-
# end
|
22
|
+
tool.add_boolean_option(:groups,false,["--groups","Print output by groups rather than for each protein"])
|
31
23
|
|
32
|
-
exit unless tool.check_options
|
33
|
-
|
34
|
-
if ( ARGV[0].nil? )
|
35
|
-
puts "You must supply an input file"
|
36
|
-
puts tool.option_parser
|
37
|
-
exit
|
38
|
-
end
|
24
|
+
exit unless tool.check_options(true)
|
39
25
|
|
40
26
|
input_file=ARGV[0]
|
41
27
|
|