protk 1.2.6.pre5 → 1.3.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
data/bin/protxml_to_gff.rb
CHANGED
@@ -7,73 +7,26 @@
|
|
7
7
|
#
|
8
8
|
|
9
9
|
require 'protk/constants'
|
10
|
-
require 'protk/
|
10
|
+
require 'protk/protxml_to_gff_tool'
|
11
11
|
require 'protk/fastadb'
|
12
|
-
require 'protk/gapped_aligner'
|
13
12
|
require 'libxml'
|
14
13
|
require 'bio'
|
15
14
|
|
16
15
|
include LibXML
|
17
16
|
|
18
|
-
tool=
|
19
|
-
tool.option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "
|
17
|
+
tool=ProtXMLToGFFTool.new()
|
20
18
|
|
19
|
+
@output_extension=".gff"
|
20
|
+
@output_suffix=""
|
21
21
|
|
22
|
-
tool.
|
23
|
-
tool.option_parser.on( '-p filename','--protxml filename', 'Observed Data (ProtXML Format)' ) do |file|
|
24
|
-
tool.options.protxml=file
|
25
|
-
end
|
26
|
-
|
27
|
-
tool.options.database=nil
|
28
|
-
tool.option_parser.on( '-d filename','--database filename', 'Database used for ms/ms searches (Fasta Format)' ) do |file|
|
29
|
-
tool.options.database=file
|
30
|
-
end
|
22
|
+
exit unless tool.check_options(true,[:database])
|
31
23
|
|
32
|
-
|
33
|
-
tool.option_parser.on( '-f term','--find term', 'Restrict output to proteins whose name matches the specified string' ) do |term|
|
34
|
-
tool.options.protein_find=term
|
35
|
-
end
|
36
|
-
|
37
|
-
tool.options.nterm_minlen=7
|
38
|
-
tool.option_parser.on( '-n len','--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len' ) do |len|
|
39
|
-
tool.options.nterm_minlen=len
|
40
|
-
end
|
24
|
+
input_proxml=ARGV[0]
|
41
25
|
|
42
|
-
tool.
|
43
|
-
tool.
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
tool.options.skip_fasta_indexing=false
|
48
|
-
tool.option_parser.on('--skip-index','Don\'t index database (Index should already exist)') do
|
49
|
-
tool.options.skip_fasta_indexing=true
|
50
|
-
end
|
51
|
-
|
52
|
-
tool.options.stack_charge_states=false
|
53
|
-
tool.option_parser.on('--stack-charge-states','Different peptide charge states get separate gff entries') do
|
54
|
-
tool.options.stack_charge_states=true
|
55
|
-
end
|
56
|
-
|
57
|
-
tool.options.collapse_redundant_proteins=false
|
58
|
-
tool.option_parser.on('--collapse-redundant-proteins','Proteins that cover genomic regions already covered will be skipped') do
|
59
|
-
tool.options.collapse_redundant_proteins=true
|
60
|
-
end
|
61
|
-
|
62
|
-
tool.options.peptide_probability_threshold=0.95
|
63
|
-
tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
|
64
|
-
tool.options.peptide_probability_threshold=thresh.to_f
|
65
|
-
end
|
66
|
-
|
67
|
-
tool.options.protein_probability_threshold=0.99
|
68
|
-
tool.option_parser.on('--prot-threshold prob','Protein Probability Threshold (Default 0.99)') do |thresh|
|
69
|
-
tool.options.protein_probability_threshold=thresh.to_f
|
70
|
-
end
|
71
|
-
|
72
|
-
exit unless tool.check_options [:protxml,:database]
|
73
|
-
|
74
|
-
gff_out_file="peptides.gff"
|
75
|
-
if ( tool.explicit_output != nil)
|
76
|
-
gff_out_file=tool.explicit_output
|
26
|
+
if ( tool.explicit_output!=nil)
|
27
|
+
gff_out_file=tool.explicit_output
|
28
|
+
else
|
29
|
+
gff_out_file=Tool.default_output_path(input_proxml,@output_extension,tool.output_prefix,@output_suffix)
|
77
30
|
end
|
78
31
|
|
79
32
|
gff_db = Bio::GFF.new()
|
@@ -92,7 +45,7 @@ def prepare_fasta(database_path,type)
|
|
92
45
|
db_filename = nil
|
93
46
|
case
|
94
47
|
when Pathname.new(database_path).exist? # It's an explicitly named db
|
95
|
-
db_filename = Pathname.new(database_path).
|
48
|
+
db_filename = Pathname.new(database_path).expand_path.to_s
|
96
49
|
else
|
97
50
|
db_filename=Constants.new.current_database_for_name(database_path)
|
98
51
|
end
|
@@ -109,457 +62,7 @@ def prepare_fasta(database_path,type)
|
|
109
62
|
orf_lookup
|
110
63
|
end
|
111
64
|
|
112
|
-
|
113
|
-
indis_proteins = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
|
114
|
-
prot_names = [protein_node['protein_name']]
|
115
|
-
for protein in indis_proteins
|
116
|
-
prot_names += [protein['protein_name']]
|
117
|
-
end
|
118
|
-
prot_names
|
119
|
-
end
|
120
|
-
|
121
|
-
def peptide_nodes(protein_node)
|
122
|
-
return protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
|
123
|
-
end
|
124
|
-
|
125
|
-
|
126
|
-
def get_fasta_record(protein_name,fastadb)
|
127
|
-
# puts "Looking up #{protein_name}"
|
128
|
-
entry = fastadb.get_by_id protein_name
|
129
|
-
if ( entry == nil)
|
130
|
-
puts "Failed lookup for #{protein_name}"
|
131
|
-
raise KeyError
|
132
|
-
end
|
133
|
-
entry
|
134
|
-
end
|
135
|
-
|
136
|
-
class CDSInfo
|
137
|
-
attr_accessor :fasta_id
|
138
|
-
attr_accessor :strand
|
139
|
-
attr_accessor :frame
|
140
|
-
attr_accessor :name
|
141
|
-
attr_accessor :scaffold
|
142
|
-
attr_accessor :start
|
143
|
-
attr_accessor :end
|
144
|
-
attr_accessor :coding_sequences
|
145
|
-
attr_accessor :is_sixframe
|
146
|
-
attr_accessor :gene_id
|
147
|
-
|
148
|
-
def overlap(candidate_entry)
|
149
|
-
return false if candidate_entry.scaffold!=self.scaffold
|
150
|
-
return false if strand!=self.strand
|
151
|
-
return false if candidate_entry.start >= self.end
|
152
|
-
return false if self.start <= candidate_entry.end
|
153
|
-
return true
|
154
|
-
end
|
155
|
-
|
156
|
-
end
|
157
|
-
|
158
|
-
def cds_info_from_fasta(fasta_entry)
|
159
|
-
info=CDSInfo.new
|
160
|
-
info.fasta_id=fasta_entry
|
161
|
-
positions = fasta_entry.identifiers.description.split(' ').collect { |coords| coords.split('|').collect {|pos| pos.to_i} }
|
162
|
-
info.coding_sequences=[]
|
163
|
-
info.gene_id
|
164
|
-
if ( positions.length < 1 )
|
165
|
-
raise EncodingError
|
166
|
-
elsif ( positions.length > 1)
|
167
|
-
info.coding_sequences = positions[1..-1]
|
168
|
-
end
|
169
|
-
|
170
|
-
info.start = positions[0][0]
|
171
|
-
info.end = positions[0][1]
|
172
|
-
|
173
|
-
info.scaffold=fasta_entry.entry_id.scan(/(scaffold_?\d+)_/)[0][0]
|
174
|
-
info.name = fasta_entry.entry_id.scan(/lcl\|(.*)/)[0][0]
|
175
|
-
|
176
|
-
if fasta_entry.entry_id =~ /frame/
|
177
|
-
info.frame=info.name.scan(/frame_(\d)/)[0][0]
|
178
|
-
info.strand = (info.frame.to_i > 3) ? '-' : '+'
|
179
|
-
info.is_sixframe = true
|
180
|
-
else
|
181
|
-
info.strand = (info.name =~ /rev/) ? '-' : '+'
|
182
|
-
info.gene_id=info.name.scan(/_\w{3}_(.*)\.t/)[0][0]
|
183
|
-
info.is_sixframe = false
|
184
|
-
end
|
185
|
-
info
|
186
|
-
end
|
187
|
-
|
188
|
-
|
189
|
-
def is_new_genome_location(candidate_entry,existing_entries)
|
190
|
-
# puts existing_entries
|
191
|
-
# require 'debugger';debugger
|
192
|
-
|
193
|
-
# genes=existing_entries.collect { |e| e.gene_id }.compact
|
194
|
-
|
195
|
-
# if genes.include?(candidate_entry.gene_id)
|
196
|
-
# return false
|
197
|
-
# end
|
198
|
-
|
199
|
-
existing_entries.each do |existing|
|
200
|
-
return false if existing.gene_id==candidate_entry.gene_id
|
201
|
-
return false if existing.overlap(candidate_entry)
|
202
|
-
end
|
203
|
-
|
204
|
-
return true
|
205
|
-
end
|
206
|
-
|
207
|
-
|
208
|
-
def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
|
209
|
-
prot_qualifiers = {"source" => "MSMS", "score" => prot_prob, "ID" => prot_id}
|
210
|
-
prot_attributes = [["ID",prot_id],["Name",entry_info.name]]
|
211
|
-
prot_gff_line = Bio::GFF::GFF3::Record.new(seqid = entry_info.scaffold,source="MSMS",feature_type="protein",
|
212
|
-
start_position=entry_info.start,end_position=entry_info.end,score=prot_prob,strand=entry_info.strand,frame=nil,attributes=prot_attributes)
|
213
|
-
prot_gff_line
|
214
|
-
end
|
215
|
-
|
216
|
-
def get_dna_sequence(protein_info,genomedb)
|
217
|
-
|
218
|
-
scaffold_sequence = get_fasta_record(protein_info.scaffold,genomedb)
|
219
|
-
gene_sequence = scaffold_sequence.naseq.to_s[(protein_info.start-1)..protein_info.end]
|
220
|
-
|
221
|
-
if ( protein_info.strand == "-")
|
222
|
-
gene_sequence = Bio::Sequence::NA.new(gene_sequence).reverse_complement
|
223
|
-
end
|
224
|
-
|
225
|
-
gene_sequence
|
226
|
-
end
|
227
|
-
|
228
|
-
def peptide_is_in_sixframe(pep_seq,gene_seq)
|
229
|
-
gs=Bio::Sequence::NA.new(gene_seq)
|
230
|
-
(1..6).each do |frame|
|
231
|
-
if gs.translate(frame).index(pep_seq)
|
232
|
-
return true
|
233
|
-
end
|
234
|
-
end
|
235
|
-
return false
|
236
|
-
end
|
237
|
-
|
238
|
-
def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
|
239
|
-
|
240
|
-
sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }
|
241
|
-
|
242
|
-
|
243
|
-
# Assume positive strand
|
244
|
-
pi_start=pepstart*3+gene_start-1
|
245
|
-
pi_end=pepend*3+gene_start-1
|
246
|
-
|
247
|
-
fragments=[]
|
248
|
-
p_i = pi_start #Initially we are looking for the first fragment
|
249
|
-
finding_start=true
|
250
|
-
|
251
|
-
sorted_cds.each_with_index do |cds_coords, i|
|
252
|
-
cds_start=cds_coords[0]
|
253
|
-
cds_end = cds_coords[1]
|
254
|
-
if cds_end < p_i # Exon is before index in sequence and doesn't contain p_i
|
255
|
-
if sorted_cds.length <= i+1
|
256
|
-
require 'debugger';debugger
|
257
|
-
end
|
258
|
-
|
259
|
-
next_coords = sorted_cds[i+1]
|
260
|
-
intron_offset = ((next_coords[0]-cds_end)-1)
|
261
|
-
p_i+=intron_offset
|
262
|
-
pi_end+=intron_offset
|
263
|
-
if !finding_start
|
264
|
-
# This is a middle exon
|
265
|
-
fragments << [cds_start,cds_end]
|
266
|
-
end
|
267
|
-
else
|
268
|
-
if finding_start
|
269
|
-
|
270
|
-
if ( pi_end <= cds_end) #Whole peptide contained in a single exon
|
271
|
-
fragments << [p_i+1,pi_end]
|
272
|
-
break;
|
273
|
-
end
|
274
|
-
|
275
|
-
|
276
|
-
fragments << [p_i+1,(cds_end)]
|
277
|
-
next_coords = sorted_cds[i+1]
|
278
|
-
intron_offset = ((next_coords[0]-cds_end)-1)
|
279
|
-
p_i+=intron_offset
|
280
|
-
pi_end+=intron_offset
|
281
|
-
p_i = pi_end
|
282
|
-
finding_start=false
|
283
|
-
else # A terminal exon
|
284
|
-
# require 'debugger';debugger
|
285
|
-
fragments << [(cds_start),(p_i)]
|
286
|
-
break;
|
287
|
-
end
|
288
|
-
end
|
289
|
-
end
|
290
|
-
[fragments]
|
291
|
-
end
|
292
|
-
|
293
|
-
# gene_seq should already have been reverse_complemented if on reverse strand
|
294
|
-
def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
|
295
|
-
# if ( peptide_is_in_sixframe(pep_seq,gene_seq))
|
296
|
-
# Peptide is in 6-frame but on a predicted transcript
|
297
|
-
# return nil
|
298
|
-
# else
|
299
|
-
|
300
|
-
# puts "Found a gap #{protein_info.fasta_id}"
|
301
|
-
if protein_info.strand=='-'
|
302
|
-
pep_index = prot_seq.reverse.index(pep_seq.reverse)
|
303
|
-
if pep_index==nil
|
304
|
-
# require 'debugger';debugger
|
305
|
-
puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
|
306
|
-
return nil
|
307
|
-
end
|
308
|
-
pep_start_i = prot_seq.reverse.index(pep_seq.reverse)+1
|
309
|
-
# Plus 1 because on reverse stand stop-codon will be at the beginning of the sequence (when read forwards). Need to eliminate it.
|
310
|
-
else
|
311
|
-
pep_start_i = prot_seq.index(pep_seq)
|
312
|
-
if pep_start_i==nil
|
313
|
-
# require 'debugger';debugger
|
314
|
-
puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
|
315
|
-
return nil
|
316
|
-
end
|
317
|
-
end
|
318
|
-
pep_end_i = pep_start_i+pep_seq.length
|
319
|
-
|
320
|
-
return fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
|
321
|
-
# end
|
322
|
-
end
|
323
|
-
|
324
|
-
def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
|
325
|
-
|
326
|
-
if ( protein_info.strand == '-' )
|
327
|
-
prot_seq = prot_seq.reverse
|
328
|
-
pep_seq = pep_seq.reverse
|
329
|
-
end
|
330
|
-
|
331
|
-
start_indexes = [0]
|
332
|
-
|
333
|
-
prot_seq.scan /#{pep_seq}/ do |match|
|
334
|
-
start_indexes << prot_seq.index(match,start_indexes.last)
|
335
|
-
end
|
336
|
-
start_indexes.delete_at(0)
|
337
|
-
|
338
|
-
start_indexes.collect do |si|
|
339
|
-
pep_genomic_start = protein_info.start + 3*si
|
340
|
-
pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
|
341
|
-
[[pep_genomic_start,pep_genomic_end]]
|
342
|
-
end
|
343
|
-
|
344
|
-
end
|
345
|
-
|
346
|
-
# Returns a 4-mer [genomic_start,fragment1_end(or0),frag2_start(or0),genomic_end]
|
347
|
-
def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
|
348
|
-
if ( protein_info.is_sixframe)
|
349
|
-
return get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
|
350
|
-
else
|
351
|
-
return get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
|
352
|
-
end
|
353
|
-
end
|
354
|
-
|
355
|
-
|
356
|
-
def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
|
357
|
-
scaff = get_fasta_record(protein_info.scaffold,genomedb)
|
358
|
-
scaffold_seq = Bio::Sequence::NA.new(scaff.seq)
|
359
|
-
|
360
|
-
fragment_phase = 0
|
361
|
-
ordered_coords= protein_info.strand=='+' ? coords : coords.reverse
|
362
|
-
if name=="CDS"
|
363
|
-
frag_id="#{pep_id}.fg"
|
364
|
-
else
|
365
|
-
frag_id="#{pep_id}.sp"
|
366
|
-
end
|
367
|
-
gff_lines = ordered_coords.collect do |frag_start,frag_end|
|
368
|
-
frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
|
369
|
-
|
370
|
-
begin
|
371
|
-
frag_frame = fragment_phase+1
|
372
|
-
frag_seq = nil
|
373
|
-
if ( protein_info.strand=='-')
|
374
|
-
frag_seq = frag_naseq.reverse_complement.translate(frag_frame)
|
375
|
-
else
|
376
|
-
frag_seq = frag_naseq.translate(frag_frame)
|
377
|
-
end
|
378
|
-
rescue
|
379
|
-
if frag_naseq.length > 1
|
380
|
-
puts "Unable to translate #{frag_naseq}"
|
381
|
-
# require 'debugger'
|
382
|
-
end
|
383
|
-
frag_seq="*"
|
384
|
-
end
|
385
|
-
|
386
|
-
fragment_record=Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
|
387
|
-
feature_type=name,start_position=frag_start,end_position=frag_end,score='',
|
388
|
-
strand=protein_info.strand,frame=fragment_phase,attributes=[["Parent",pep_id],["ID",frag_id],["Name",frag_seq]])
|
389
|
-
|
390
|
-
|
391
|
-
remainder=(frag_naseq.length-fragment_phase) % 3
|
392
|
-
fragment_phase=(3-remainder) % 3
|
393
|
-
|
394
|
-
fragment_record
|
395
|
-
end
|
396
|
-
|
397
|
-
|
398
|
-
concat_seq=nil
|
399
|
-
|
400
|
-
coords.each do |frag_start,frag_end|
|
401
|
-
frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
|
402
|
-
concat_seq += frag_naseq unless concat_seq == nil
|
403
|
-
concat_seq = frag_naseq if concat_seq==nil
|
404
|
-
end
|
405
|
-
|
406
|
-
check_seq = protein_info.strand=='-' ? concat_seq.reverse_complement.translate : concat_seq.translate
|
407
|
-
if ( check_seq != peptide_seq)
|
408
|
-
require 'debugger';debugger
|
409
|
-
puts "Fragment seqs not equal to peptide seqs"
|
410
|
-
end
|
411
|
-
|
412
|
-
return gff_lines
|
413
|
-
|
414
|
-
end
|
415
|
-
|
416
|
-
def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
|
417
|
-
pi=protein_seq.index(peptide_seq)
|
418
|
-
if ( protein_seq[pi]=='M' )
|
419
|
-
is_tryptic=false
|
420
|
-
if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R') )
|
421
|
-
is_tryptic=true
|
422
|
-
elsif (pi==0)
|
423
|
-
is_tryptic=true
|
424
|
-
end
|
425
|
-
return nil unless is_tryptic
|
426
|
-
|
427
|
-
start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-2
|
428
|
-
# require 'debugger';debugger
|
429
|
-
return [start_codon_coord,start_codon_coord+2]
|
430
|
-
else
|
431
|
-
return nil
|
432
|
-
end
|
433
|
-
end
|
434
|
-
|
435
|
-
def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
|
436
|
-
|
437
|
-
if ( (peptide_seq[-1]!='K' && peptide_seq[-1]!='R' ) )
|
438
|
-
|
439
|
-
codon_coord = (strand=='+') ? peptide_genomic_end-3 : peptide_genomic_start+1
|
440
|
-
# require 'debugger';debugger
|
441
|
-
return [codon_coord,codon_coord+2]
|
442
|
-
else
|
443
|
-
return nil
|
444
|
-
end
|
445
|
-
end
|
446
|
-
|
447
|
-
|
448
|
-
def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
|
449
|
-
pi=protein_seq.index(peptide_seq)
|
450
|
-
if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R' && protein_seq[pi]!='M') )
|
451
|
-
# Since trypsin sometimes cleaves before P (ie breaking the rule)
|
452
|
-
# we don't check for it and assume those cases are real tryptic termini
|
453
|
-
reverse_leader_seq=protein_seq[0..pi].reverse
|
454
|
-
mi=reverse_leader_seq.index('M')
|
455
|
-
|
456
|
-
if ( mi==nil )
|
457
|
-
puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
|
458
|
-
return nil
|
459
|
-
end
|
460
|
-
|
461
|
-
mi=pi-mi
|
462
|
-
|
463
|
-
ntermseq=protein_seq[mi..(pi-1)]
|
464
|
-
|
465
|
-
# if ( ntermseq.length < minlen )
|
466
|
-
# return nil
|
467
|
-
# end
|
468
|
-
|
469
|
-
# $STDOUT.write protein_seq[mi..(pi+peptide_seq.length-1)]
|
470
|
-
# require 'debugger';debugger
|
471
|
-
full_seq_with_annotations = "#{ntermseq}(cleaved)#{protein_seq[(pi..(pi+peptide_seq.length-1))]}"
|
472
|
-
|
473
|
-
return full_seq_with_annotations
|
474
|
-
else
|
475
|
-
return nil
|
476
|
-
end
|
477
|
-
end
|
478
|
-
|
479
|
-
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)
|
480
|
-
|
481
|
-
prot_seq = protein_seq
|
482
|
-
pep_seq = peptide_seq
|
483
|
-
|
484
|
-
|
485
|
-
peptide_coords = get_peptide_coordinates(prot_seq,pep_seq,protein_info,dna_sequence)
|
486
|
-
|
487
|
-
if ( peptide_coords==nil ) # Return value of nil means the entry is a predicted transcript that should already be covered by 6-frame
|
488
|
-
return []
|
489
|
-
end
|
490
|
-
|
491
|
-
gff_records=[]
|
492
|
-
|
493
|
-
# Now convert peptide coordinate to genome coordinates
|
494
|
-
# And create gff lines for each match
|
495
|
-
peptide_coords.each do |coords|
|
496
|
-
|
497
|
-
# require 'debugger';debugger
|
498
|
-
pep_genomic_start = coords.first[0]
|
499
|
-
pep_genomic_end = coords.last[1]
|
500
|
-
|
501
|
-
pep_id = "#{prot_id}.p#{peptide_count.to_s}"
|
502
|
-
pep_attributes = [["ID",pep_id],["Parent",prot_id],["Name",pep_seq]]
|
503
|
-
|
504
|
-
pep_gff_line = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
|
505
|
-
feature_type="peptide",start_position=pep_genomic_start,end_position=pep_genomic_end,score=peptide_prob,
|
506
|
-
strand=protein_info.strand,frame=nil,attributes=pep_attributes)
|
507
|
-
|
508
|
-
# For standard peptides
|
509
|
-
frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
|
510
|
-
gff_records += [pep_gff_line] + frag_gffs
|
511
|
-
# require 'debugger';debugger
|
512
|
-
# For peptides with only 1 tryptic terminus
|
513
|
-
start_codon_coords=get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
|
514
|
-
if start_codon_coords
|
515
|
-
start_codon_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
|
516
|
-
feature_type="start_codon",start_position=start_codon_coords[0],end_position=start_codon_coords[1],score='',
|
517
|
-
strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
|
518
|
-
gff_records+=[start_codon_gff]
|
519
|
-
end
|
520
|
-
|
521
|
-
cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
|
522
|
-
if ( cterm_coords )
|
523
|
-
cterm_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
|
524
|
-
feature_type="cterm",start_position=cterm_coords[0],end_position=cterm_coords[1],score='',
|
525
|
-
strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
|
526
|
-
gff_records+=[start_codon_gff]
|
527
|
-
end
|
528
|
-
|
529
|
-
end
|
530
|
-
# puts gff_records
|
531
|
-
|
532
|
-
gff_records
|
533
|
-
end
|
534
|
-
|
535
|
-
def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
|
536
|
-
pep_id = "#{prot_id}.p#{peptide_count.to_s}"
|
537
|
-
signal_peptide = get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
|
538
|
-
if signal_peptide
|
539
|
-
$stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"
|
540
|
-
raw_signal_peptide=signal_peptide.sub(/\(cleaved\)/,"")
|
541
|
-
# Get raw signal_peptide sequence
|
542
|
-
|
543
|
-
signal_peptide_coords=get_peptide_coordinates(protein_seq,raw_signal_peptide,protein_info,dna_sequence)
|
544
|
-
if signal_peptide_coords
|
545
|
-
signal_peptide_coords.each do |spcoords|
|
546
|
-
signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,raw_signal_peptide,genomedb,"signalpeptide")
|
547
|
-
gff_records += signal_peptide_gff
|
548
|
-
end
|
549
|
-
end
|
550
|
-
end
|
551
|
-
end
|
552
|
-
|
553
|
-
def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
|
554
|
-
nameindex = peptide_gff.attributes.index {|obj| obj[0]=="Name" }
|
555
|
-
pep_seq = peptide_gff.attributes[nameindex][1]
|
556
|
-
existing = peptides_covered_genome[pep_seq]
|
557
|
-
return true if existing==peptide_gff.start
|
558
|
-
|
559
|
-
return false
|
560
|
-
end
|
561
|
-
|
562
|
-
proteins = parse_proteins(tool.protxml)
|
65
|
+
proteins = parse_proteins(input_proxml)
|
563
66
|
fastadb = prepare_fasta(tool.database,'prot')
|
564
67
|
genomedb = nil
|
565
68
|
if tool.genome
|
@@ -583,7 +86,7 @@ for prot in proteins
|
|
583
86
|
end
|
584
87
|
|
585
88
|
# Gets identifiers of all proteins (includeing indistinguishable ones)
|
586
|
-
prot_names=protein_names(prot)
|
89
|
+
prot_names=tool.protein_names(prot)
|
587
90
|
|
588
91
|
|
589
92
|
if tool.protein_find!=nil
|
@@ -591,19 +94,19 @@ for prot in proteins
|
|
591
94
|
end
|
592
95
|
|
593
96
|
|
594
|
-
peptides=peptide_nodes(prot)
|
97
|
+
peptides=tool.peptide_nodes(prot)
|
595
98
|
entries_covered=[]
|
596
99
|
for protein_name in prot_names
|
597
100
|
protein_count += 1
|
598
101
|
prot_id = "pr#{protein_count.to_s}"
|
599
102
|
begin
|
600
103
|
|
601
|
-
protein_fasta_entry = get_fasta_record(protein_name,fastadb)
|
602
|
-
protein_info = cds_info_from_fasta(protein_fasta_entry)
|
104
|
+
protein_fasta_entry = tool.get_fasta_record(protein_name,fastadb)
|
105
|
+
protein_info = tool.cds_info_from_fasta(protein_fasta_entry)
|
603
106
|
|
604
|
-
unless (tool.collapse_redundant_proteins && !is_new_genome_location(protein_info,entries_covered) )
|
107
|
+
unless (tool.collapse_redundant_proteins && !tool.is_new_genome_location(protein_info,entries_covered) )
|
605
108
|
|
606
|
-
protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
|
109
|
+
protein_gff = tool.generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
|
607
110
|
|
608
111
|
gff_db.records += ["##gff-version 3\n","##sequence-region #{protein_info.scaffold} 1 160\n",protein_gff]
|
609
112
|
|
@@ -624,15 +127,15 @@ for prot in proteins
|
|
624
127
|
dna_sequence=nil
|
625
128
|
if !protein_info.is_sixframe
|
626
129
|
throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
|
627
|
-
dna_sequence = get_dna_sequence(protein_info,genomedb)
|
130
|
+
dna_sequence = tool.get_dna_sequence(protein_info,genomedb)
|
628
131
|
end
|
629
132
|
|
630
133
|
|
631
|
-
peptide_gff = generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
|
134
|
+
peptide_gff = tool.generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
|
632
135
|
|
633
|
-
unless (peptide_gff.length==0 || peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
|
136
|
+
unless (peptide_gff.length==0 || tool.peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
|
634
137
|
|
635
|
-
add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
|
138
|
+
tool.add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
|
636
139
|
|
637
140
|
gff_db.records += peptide_gff
|
638
141
|
|
data/bin/protxml_to_table.rb
CHANGED
@@ -19,23 +19,9 @@ include LibXML
|
|
19
19
|
tool=Tool.new([:explicit_output])
|
20
20
|
tool.option_parser.banner = "Convert a protXML file to a tab delimited table.\n\nUsage: protxml_to_table.rb [options] file1.protXML"
|
21
21
|
|
22
|
-
tool.
|
23
|
-
tool.option_parser.on("--groups","Print output by groups rather than for each protein") do
|
24
|
-
tool.options.groups=true
|
25
|
-
end
|
26
|
-
|
27
|
-
# tool.options.proteinid_regex=".*?\|.*?\|(.*)"
|
28
|
-
# tool.option_parser.on( '--regex rexpr', 'Regex' ) do |regex|
|
29
|
-
# tool.options.proteinid_regex=regex
|
30
|
-
# end
|
22
|
+
tool.add_boolean_option(:groups,false,["--groups","Print output by groups rather than for each protein"])
|
31
23
|
|
32
|
-
exit unless tool.check_options
|
33
|
-
|
34
|
-
if ( ARGV[0].nil? )
|
35
|
-
puts "You must supply an input file"
|
36
|
-
puts tool.option_parser
|
37
|
-
exit
|
38
|
-
end
|
24
|
+
exit unless tool.check_options(true)
|
39
25
|
|
40
26
|
input_file=ARGV[0]
|
41
27
|
|