protk 1.2.6.pre5 → 1.3.0.pre1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +84 -45
- data/bin/add_retention_times.rb +9 -5
- data/bin/augustus_to_proteindb.rb +7 -11
- data/bin/interprophet.rb +28 -46
- data/bin/make_decoy.rb +16 -48
- data/bin/mascot_search.rb +57 -71
- data/bin/mascot_to_pepxml.rb +13 -26
- data/bin/msgfplus_search.rb +70 -107
- data/bin/omssa_search.rb +52 -109
- data/bin/peptide_prophet.rb +44 -119
- data/bin/pepxml_to_table.rb +24 -27
- data/bin/protein_prophet.rb +22 -82
- data/bin/protxml_to_gff.rb +22 -519
- data/bin/protxml_to_table.rb +2 -16
- data/bin/sixframe.rb +10 -32
- data/bin/tandem_search.rb +30 -403
- data/bin/tandem_to_pepxml.rb +43 -0
- data/bin/unimod_to_loc.rb +1 -1
- data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
- data/ext/decoymaker/extconf.rb +3 -0
- data/lib/protk/constants.rb +16 -2
- data/lib/protk/data/default_config.yml +2 -1
- data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
- data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
- data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
- data/lib/protk/data/tandem_params.xml +17 -54
- data/lib/protk/fastadb.rb +2 -2
- data/lib/protk/prophet_tool.rb +1 -1
- data/lib/protk/protxml_to_gff_tool.rb +474 -0
- data/lib/protk/search_tool.rb +58 -103
- data/lib/protk/setup_rakefile.rake +9 -5
- data/lib/protk/tandem_search_tool.rb +256 -0
- data/lib/protk/tool.rb +85 -104
- data/lib/protk.rb +1 -6
- metadata +24 -103
- data/bin/annotate_ids.rb +0 -59
- data/bin/asapratio.rb +0 -27
- data/bin/blastxml_to_table.rb +0 -119
- data/bin/correct_omssa_retention_times.rb +0 -27
- data/bin/feature_finder.rb +0 -95
- data/bin/file_convert.rb +0 -164
- data/bin/generate_omssa_loc.rb +0 -42
- data/bin/gffmerge.rb +0 -208
- data/bin/libra.rb +0 -70
- data/bin/toppas_pipeline.rb +0 -84
- data/bin/uniprot_annotation.rb +0 -141
- data/bin/xls_to_table.rb +0 -52
- data/bin/xpress.rb +0 -27
- data/ext/protk/decoymaker/extconf.rb +0 -3
- data/ext/protk/simplealign/extconf.rb +0 -3
- data/lib/protk/biotools_excel_converter.rb +0 -60
- data/lib/protk/eupathdb_gene_information_table.rb +0 -158
- data/lib/protk/gapped_aligner.rb +0 -264
- data/lib/protk/protein_annotator.rb +0 -646
- data/lib/protk/spreadsheet_extensions.rb +0 -79
- data/lib/protk/xtandem_defaults.rb +0 -11
@@ -0,0 +1,474 @@
|
|
1
|
+
require 'protk/tool'
|
2
|
+
|
3
|
+
class ProtXMLToGFFTool < Tool
|
4
|
+
|
5
|
+
# Sets up the command-line interface for protxml_to_gff.rb.
#
# Passes [:explicit_output] to the Tool superclass constructor, installs the
# usage banner and then registers every option, one table row at a time, in
# the order the rows are listed.
def initialize
  super([:explicit_output])

  @option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "

  # Each row: [registration method, option key, default value, option parser spec]
  option_table = [
    [:add_value_option, :database, nil,
     ['-d filename', '--database filename', 'Database used for ms/ms searches (Fasta Format)']],
    [:add_value_option, :genome, nil,
     ['-g filename', '--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)']],
    [:add_value_option, :protein_find, nil,
     ['-f term', '--find term', 'Restrict output to proteins whose name matches the specified string']],
    [:add_value_option, :nterm_minlen, 7,
     ['-n len', '--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len']],
    [:add_boolean_option, :skip_fasta_indexing, false,
     ['--skip-index', "Don't index database (Index should already exist)"]],
    [:add_boolean_option, :stack_charge_states, false,
     ['--stack-charge-states', 'Different peptide charge states get separate gff entries']],
    [:add_boolean_option, :collapse_redundant_proteins, false,
     ['--collapse-redundant-proteins', 'Proteins that cover genomic regions already covered will be skipped']],
    [:add_value_option, :peptide_probability_threshold, 0.95,
     ['--threshold prob', 'Peptide Probability Threshold (Default 0.95)']],
    [:add_value_option, :protein_probability_threshold, 0.99,
     ['--prot-threshold prob', 'Protein Probability Threshold (Default 0.99)']]
  ]

  option_table.each do |register, key, default, spec|
    send(register, key, default, spec)
  end
end
|
22
|
+
|
23
|
+
|
24
|
+
# Collects the name of a protXML protein node followed by the names of all of
# its indistinguishable_protein children.
#
# protein_node must respond to #find (XPath + namespace) and #[] (attribute
# lookup). Returns an Array whose first element is the node's own
# 'protein_name' attribute.
def protein_names(protein_node)
  alternates = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
  [protein_node['protein_name']] + alternates.map { |alternate| alternate['protein_name'] }
end
|
32
|
+
|
33
|
+
# Returns whatever protein_node.find yields for the protXML 'peptide' XPath,
# i.e. the peptide elements found beneath the given protein node.
def peptide_nodes(protein_node)
  xpath = 'protxml:peptide'
  namespace = 'protxml:http://regis-web.systemsbiology.net/protXML'
  protein_node.find(xpath, namespace)
end
|
36
|
+
|
37
|
+
|
38
|
+
# Looks up a record by id in an indexed fasta database.
#
# protein_name - id passed to fastadb.get_by_id
# fastadb      - object responding to #get_by_id
#
# Returns the record. Raises KeyError when the lookup yields nil; the error
# message names the id that failed so it is not lost when the exception is
# rescued or logged further up (the same text is still printed to stdout, as
# before).
def get_fasta_record(protein_name,fastadb)
  entry = fastadb.get_by_id protein_name
  if entry.nil?
    failure = "Failed lookup for #{protein_name}"
    puts failure
    raise KeyError, failure
  end
  entry
end
|
47
|
+
|
48
|
+
# Plain record describing where a protein entry sits on the genome: the
# scaffold it belongs to, its strand ('+' or '-'), its start/end coordinates
# and, where known, its coding sequences and gene id.
class CDSInfo
  attr_accessor :fasta_id, :strand, :frame, :name, :scaffold
  attr_accessor :start, :end
  attr_accessor :coding_sequences, :is_sixframe, :gene_id

  # Tells whether candidate_entry shares genomic territory with this entry.
  #
  # Two entries overlap only when they are on the same scaffold and strand and
  # their [start, end] spans intersect. Spans that merely touch (one starts
  # exactly where the other ends) are treated as not overlapping, matching the
  # first bound check of the original implementation.
  #
  # Returns true or false.
  def overlap(candidate_entry)
    return false if candidate_entry.scaffold != scaffold
    # Compare against the candidate's strand (the old code compared our own
    # strand with itself, which could never differ).
    return false if candidate_entry.strand != strand
    # Candidate lies entirely after this entry.
    return false if candidate_entry.start >= self.end
    # Candidate lies entirely before this entry (the old test was inverted and
    # rejected exactly the overlapping cases).
    return false if start >= candidate_entry.end

    true
  end
end
|
69
|
+
|
70
|
+
# Builds a CDSInfo from a fasta entry whose description holds
# space-separated "start|end" coordinate pairs and whose entry id has the
# form "lcl|<name>" with a "scaffold<N>_" component.
#
# The first coordinate pair becomes the overall start/end; any further pairs
# become coding_sequences. Entries whose id contains "frame" are flagged as
# six-frame translations (frames above 3 are on the '-' strand); all other
# entries take their strand from the presence of "rev" in the name and get a
# gene_id parsed from the name.
#
# Raises EncodingError when the description contains no coordinates. A
# NoMethodError results when the id does not match the expected patterns.
def cds_info_from_fasta(fasta_entry)
  info = CDSInfo.new
  info.fasta_id = fasta_entry

  positions = fasta_entry.identifiers.description.split(' ').map do |pair|
    pair.split('|').map { |coordinate| coordinate.to_i }
  end
  raise EncodingError if positions.empty?

  overall_span, *exon_spans = positions
  info.coding_sequences = exon_spans
  info.start = overall_span[0]
  info.end = overall_span[1]

  entry_id = fasta_entry.entry_id
  info.scaffold = entry_id.match(/(scaffold_?\d+)_/)[1]
  info.name = entry_id.match(/lcl\|(.*)/)[1]

  if entry_id =~ /frame/
    info.frame = info.name.match(/frame_(\d)/)[1]
    info.strand = info.frame.to_i > 3 ? '-' : '+'
    info.is_sixframe = true
  else
    info.strand = (info.name =~ /rev/) ? '-' : '+'
    info.gene_id = info.name.match(/_\w{3}_(.*)\.t/)[1]
    info.is_sixframe = false
  end

  info
end
|
99
|
+
|
100
|
+
|
101
|
+
# Decides whether candidate_entry covers a genomic location that none of
# existing_entries already covers.
#
# A candidate is rejected (false) when an existing entry belongs to the same
# gene or overlaps it. Gene ids are only compared when the candidate actually
# has one: entries without a gene id (six-frame translations leave it nil)
# would otherwise all compare equal to each other and every one after the
# first would be discarded regardless of position.
#
# Returns true when the location is new, false otherwise.
def is_new_genome_location(candidate_entry,existing_entries)
  candidate_gene = candidate_entry.gene_id

  existing_entries.none? do |existing|
    same_gene = !candidate_gene.nil? && existing.gene_id == candidate_gene
    same_gene || existing.overlap(candidate_entry)
  end
end
|
118
|
+
|
119
|
+
|
120
|
+
# Creates the GFF3 "protein" record for a protein entry.
#
# protein_name - accepted for interface compatibility; not used here
# entry_info   - supplies scaffold, start, end, strand and name
# prot_prob    - written to the record's score column
# prot_id      - written to the record's ID attribute
#
# Returns a Bio::GFF::GFF3::Record with source "MSMS" and no phase.
def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
  Bio::GFF::GFF3::Record.new(
    entry_info.scaffold,
    "MSMS",
    "protein",
    entry_info.start,
    entry_info.end,
    prot_prob,
    entry_info.strand,
    nil,
    [["ID", prot_id], ["Name", entry_info.name]]
  )
end
|
127
|
+
|
128
|
+
# Extracts the nucleotide sequence underlying a protein entry from its
# scaffold.
#
# The scaffold record is fetched from genomedb via get_fasta_record and sliced
# from string index (start - 1) through index end inclusive. For '-' strand
# entries the slice is wrapped in Bio::Sequence::NA and reverse complemented;
# otherwise the plain String slice is returned.
#
# NOTE(review): the slice runs to index `end`, one position further than the
# (start-1)..(end-1) slicing used for fragments elsewhere in this class -
# confirm whether the extra base is intended.
def get_dna_sequence(protein_info,genomedb)
  scaffold_record = get_fasta_record(protein_info.scaffold, genomedb)
  scaffold_bases = scaffold_record.naseq.to_s
  region = scaffold_bases[(protein_info.start - 1)..protein_info.end]

  return region unless protein_info.strand == "-"

  Bio::Sequence::NA.new(region).reverse_complement
end
|
139
|
+
|
140
|
+
# Reports whether pep_seq occurs in any of the six translation frames of
# gene_seq. Returns true or false.
def peptide_is_in_sixframe(pep_seq,gene_seq)
  nucleotides = Bio::Sequence::NA.new(gene_seq)
  (1..6).any? { |frame| nucleotides.translate(frame).index(pep_seq) }
end
|
149
|
+
|
150
|
+
# Maps a peptide's position within a protein onto genomic fragments, splitting
# it wherever it crosses an intron.
#
# pepstart, pepend - residue offsets of the peptide within the protein
#                    (pepend is exclusive)
# gene_start       - genomic coordinate at which the gene starts
# gene_end         - accepted for interface compatibility; not used
# coding_sequences - Array of [start, end] exon coordinates, in any order
#
# The calculation assumes the positive strand. Returns a one-element Array
# wrapping the list of [start, end] fragments (the same nesting the six-frame
# lookup uses for its results).
#
# Raises ArgumentError when the peptide runs past the last coding sequence.
# The previous implementation dropped into an interactive debugger at that
# point and then failed on a nil dereference.
def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
  sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }

  # Assume positive strand
  pi_start = pepstart * 3 + gene_start - 1
  pi_end = pepend * 3 + gene_start - 1

  # Length of the intron between exon i and the exon after it.
  intron_offset_after = lambda do |i|
    following = sorted_cds[i + 1]
    if following.nil?
      raise ArgumentError,
            "Peptide at protein positions #{pepstart}..#{pepend} extends beyond the last coding sequence"
    end
    (following[0] - sorted_cds[i][1]) - 1
  end

  fragments = []
  p_i = pi_start # Initially we are looking for the first fragment
  finding_start = true

  sorted_cds.each_with_index do |(cds_start, cds_end), i|
    if cds_end < p_i
      # Exon lies wholly before the position we are looking for: skip over the
      # intron that follows it by shifting both markers.
      intron_offset = intron_offset_after.call(i)
      p_i += intron_offset
      pi_end += intron_offset
      # Once the peptide has started, an exon we pass completely is a middle exon.
      fragments << [cds_start, cds_end] unless finding_start
    elsif finding_start
      if pi_end <= cds_end
        # Whole peptide contained in a single exon
        fragments << [p_i + 1, pi_end]
        break
      end

      # Peptide starts here but continues into a later exon.
      fragments << [p_i + 1, cds_end]
      pi_end += intron_offset_after.call(i)
      p_i = pi_end # from now on we are searching for the peptide's end
      finding_start = false
    else
      # Terminal exon: the peptide ends inside it.
      fragments << [cds_start, p_i]
      break
    end
  end

  [fragments]
end
|
204
|
+
|
205
|
+
# Locates pep_seq inside prot_seq for a predicted-transcript entry and
# converts the position to genomic fragments.
#
# gene_seq should already have been reverse complemented if the entry is on
# the reverse strand; it is not consulted by the current implementation.
#
# On the '-' strand both sequences are reversed before searching and the
# resulting index is advanced by one, because the stop codon then sits at the
# beginning of the sequence and has to be skipped.
#
# Returns the result of fragment_coords_from_protein_coords, or nil (after
# printing a warning) when the peptide does not occur in the protein.
def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
  on_reverse_strand = protein_info.strand == '-'

  located_at =
    if on_reverse_strand
      prot_seq.reverse.index(pep_seq.reverse)
    else
      prot_seq.index(pep_seq)
    end

  if located_at.nil?
    puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
    return nil
  end

  pep_start_i = on_reverse_strand ? located_at + 1 : located_at
  pep_end_i = pep_start_i + pep_seq.length

  fragment_coords_from_protein_coords(pep_start_i, pep_end_i, protein_info.start, protein_info.end,
                                      protein_info.coding_sequences)
end
|
235
|
+
|
236
|
+
# Finds every (non-overlapping) occurrence of pep_seq in a six-frame
# translated protein and converts each to genomic coordinates.
#
# For '-' strand entries both sequences are reversed first, so that offsets
# count from the end of the protein that sits at protein_info.start.
#
# Returns an Array with one element per occurrence; each element is a
# one-fragment list [[genomic_start, genomic_end]]. Returns [] when the peptide
# does not occur (or is empty).
#
# Occurrences are located with String#index, advancing past each hit. The
# earlier version restarted every search at the previous hit and therefore
# reported the first occurrence repeatedly; it also interpolated the peptide
# into a regular expression without escaping it.
def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
  # An empty peptide would match everywhere and never advance the search.
  return [] if pep_seq.empty?

  if protein_info.strand == '-'
    prot_seq = prot_seq.reverse
    pep_seq = pep_seq.reverse
  end

  start_indexes = []
  search_from = 0
  while (hit = prot_seq.index(pep_seq, search_from))
    start_indexes << hit
    search_from = hit + pep_seq.length
  end

  start_indexes.map do |si|
    pep_genomic_start = protein_info.start + 3 * si
    pep_genomic_end = pep_genomic_start + 3 * pep_seq.length - 1
    [[pep_genomic_start, pep_genomic_end]]
  end
end
|
257
|
+
|
258
|
+
# Dispatches to the coordinate lookup that fits the kind of protein entry.
#
# Six-frame entries go to get_peptide_coordinates_sixframe; everything else
# goes to get_peptide_coordinates_from_transcript_info (which also receives
# gene_seq). The result is passed through unchanged: a nested Array of
# [start, end] genomic fragments, or nil when the transcript lookup cannot
# find the peptide.
def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
  return get_peptide_coordinates_sixframe(prot_seq, pep_seq, protein_info) if protein_info.is_sixframe

  get_peptide_coordinates_from_transcript_info(prot_seq, pep_seq, protein_info, gene_seq)
end
|
266
|
+
|
267
|
+
|
268
|
+
# Builds one GFF3 record per genomic fragment of a peptide.
#
# coords       - Array of [start, end] genomic fragments (1-based, inclusive)
# protein_info - supplies scaffold and strand
# pep_id       - id of the parent peptide record
# peptide_seq  - expected translation of the concatenated fragments
# genomedb     - fasta database holding the scaffold sequences
# name         - feature type for the records; "CDS" yields ids ending in
#                ".fg", anything else ids ending in ".sp"
#
# Fragments are processed in transcription order (reversed for the '-'
# strand) so the phase can be carried from one fragment to the next. Each
# record's Name attribute is the fragment's translation, or "*" when the
# fragment cannot be translated.
#
# After building the records the concatenated fragments are translated and
# compared with peptide_seq; a mismatch is reported on stdout. (A leftover
# interactive debugger breakpoint used to fire at this point; it has been
# removed so a mismatch no longer halts or crashes a run.)
#
# Returns the Array of Bio::GFF::GFF3::Record objects.
def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
  scaff = get_fasta_record(protein_info.scaffold, genomedb)
  scaffold_seq = Bio::Sequence::NA.new(scaff.seq)
  on_reverse_strand = protein_info.strand == '-'

  fragment_phase = 0
  ordered_coords = protein_info.strand == '+' ? coords : coords.reverse
  frag_id = name == "CDS" ? "#{pep_id}.fg" : "#{pep_id}.sp"

  gff_lines = ordered_coords.collect do |frag_start, frag_end|
    frag_naseq = scaffold_seq[frag_start - 1..frag_end - 1]

    begin
      frag_frame = fragment_phase + 1
      frag_seq =
        if on_reverse_strand
          frag_naseq.reverse_complement.translate(frag_frame)
        else
          frag_naseq.translate(frag_frame)
        end
    rescue
      # Very short fragments are expected to be untranslatable; only report longer ones.
      puts "Unable to translate #{frag_naseq}" if frag_naseq.length > 1
      frag_seq = "*"
    end

    fragment_record = Bio::GFF::GFF3::Record.new(
      protein_info.scaffold, "MSMS", name, frag_start, frag_end, '',
      protein_info.strand, fragment_phase,
      [["Parent", pep_id], ["ID", frag_id], ["Name", frag_seq]]
    )

    # Bases left over after the last complete codon determine how many bases
    # of the next fragment belong to the codon split across the boundary.
    remainder = (frag_naseq.length - fragment_phase) % 3
    fragment_phase = (3 - remainder) % 3

    fragment_record
  end

  # Sanity check: the fragments, joined in genomic order, should translate to the peptide.
  concat_seq = coords.map { |frag_start, frag_end| scaffold_seq[frag_start - 1..frag_end - 1] }.reduce(:+)

  unless concat_seq.nil? # nothing to verify when there are no fragments
    check_seq = on_reverse_strand ? concat_seq.reverse_complement.translate : concat_seq.translate
    puts "Fragment seqs not equal to peptide seqs" if check_seq != peptide_seq
  end

  gff_lines
end
|
327
|
+
|
328
|
+
# Works out the genomic coordinates of a start codon implied by a peptide.
#
# A start codon is reported only when the peptide begins with 'M' and either
# sits at the very start of the protein or is NOT preceded by 'K' or 'R'.
#
# peptide_genomic_start, peptide_genomic_end - genomic span of the peptide
# peptide_seq, protein_seq                   - amino acid sequences
# strand                                     - '+' or anything else for reverse
#
# Returns [start, start + 2] (the first three bases of the peptide on '+',
# the last three otherwise), or nil when no start codon is implied. Also
# returns nil when the peptide does not occur in the protein, which previously
# raised a TypeError.
def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  pi = protein_seq.index(peptide_seq)
  return nil if pi.nil?
  return nil unless protein_seq[pi] == 'M'

  if pi > 0
    preceding = protein_seq[pi - 1]
    return nil if preceding == 'K' || preceding == 'R'
  end

  start_codon_coord = strand == '+' ? peptide_genomic_start : peptide_genomic_end - 2
  [start_codon_coord, start_codon_coord + 2]
end
|
346
|
+
|
347
|
+
# Returns a three-base genomic span marking the C-terminal end of a peptide
# whose last residue is neither 'K' nor 'R'; returns nil for peptides that do
# end in 'K' or 'R'.
#
# On the '+' strand the span starts at peptide_genomic_end - 3, otherwise at
# peptide_genomic_start + 1. protein_seq is accepted but not used.
#
# NOTE(review): these offsets are not the peptide's own last codon
# (end-2..end on '+') - confirm which codon this feature is meant to mark.
def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  last_residue = peptide_seq[-1]
  return nil if last_residue == 'K' || last_residue == 'R'

  codon_coord = strand == '+' ? peptide_genomic_end - 3 : peptide_genomic_start + 1
  [codon_coord, codon_coord + 2]
end
|
358
|
+
|
359
|
+
|
360
|
+
# Infers the N-terminal sequence that precedes a peptide whose N-terminus is
# not a tryptic cleavage site.
#
# Applies only when the peptide is not at the start of the protein, the
# residue before it is neither 'K' nor 'R', and the peptide does not itself
# begin with 'M'. Since trypsin sometimes cleaves before P (breaking the
# rule) that case is not checked for and is assumed to be a real tryptic
# terminus.
#
# The inferred sequence runs from the nearest 'M' upstream of the peptide up
# to the residue just before it.
#
# Returns "<nterm>(cleaved)<peptide>", or nil when the rule does not apply or
# no upstream 'M' exists (a message is printed in the latter case). Also
# returns nil when the peptide does not occur in the protein, which previously
# raised a NoMethodError.
def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
  pi = protein_seq.index(peptide_seq)
  return nil if pi.nil? || pi.zero?

  preceding = protein_seq[pi - 1]
  return nil if preceding == 'K' || preceding == 'R' || protein_seq[pi] == 'M'

  # Nearest methionine before the peptide (the peptide's first residue is
  # known not to be 'M', so searching up to pi - 1 is sufficient).
  mi = protein_seq.rindex('M', pi - 1)
  if mi.nil?
    puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
    return nil
  end

  ntermseq = protein_seq[mi..(pi - 1)]
  "#{ntermseq}(cleaved)#{protein_seq[pi, peptide_seq.length]}"
end
|
390
|
+
|
391
|
+
# Produces all GFF3 records for one peptide mapped onto one protein.
#
# For every genomic location of the peptide this emits, in order:
#   * a "peptide" record spanning the whole location,
#   * one "CDS" record per fragment (from generate_fragment_gffs_for_coords),
#   * a "start_codon" record when get_start_codon_coords_for_peptide finds one,
#   * a "cterm" record when get_cterm_coords_for_peptide finds one.
#
# protein_seq, peptide_seq - amino acid sequences
# protein_info             - supplies scaffold and strand
# prot_id                  - id of the parent protein record
# peptide_prob             - written to the peptide record's score column
# peptide_count            - used to form the peptide id "<prot_id>.p<count>"
# dna_sequence             - passed through to get_peptide_coordinates
# genomedb                 - passed through to generate_fragment_gffs_for_coords
#
# Returns an Array of records; empty when the peptide cannot be located.
#
# Fixes relative to the earlier version: the cterm branch appended the
# start-codon record (or nil) instead of the cterm record, and the
# start_codon/cterm attributes were passed as a flat ["Parent", id] pair
# rather than a list of [tag, value] pairs like every other record.
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)
  peptide_coords = get_peptide_coordinates(protein_seq, peptide_seq, protein_info, dna_sequence)

  # nil means the peptide could not be located in this protein
  return [] if peptide_coords.nil?

  gff_records = []

  # Convert each location to genome coordinates and create gff lines for it
  peptide_coords.each do |coords|
    pep_genomic_start = coords.first[0]
    pep_genomic_end = coords.last[1]

    pep_id = "#{prot_id}.p#{peptide_count}"
    pep_attributes = [["ID", pep_id], ["Parent", prot_id], ["Name", peptide_seq]]

    pep_gff_line = Bio::GFF::GFF3::Record.new(
      protein_info.scaffold, "MSMS", "peptide", pep_genomic_start, pep_genomic_end, peptide_prob,
      protein_info.strand, nil, pep_attributes
    )

    # For standard peptides
    frag_gffs = generate_fragment_gffs_for_coords(coords, protein_info, pep_id, peptide_seq, genomedb, "CDS")
    gff_records += [pep_gff_line] + frag_gffs

    # For peptides with only 1 tryptic terminus
    start_codon_coords = get_start_codon_coords_for_peptide(pep_genomic_start, pep_genomic_end, peptide_seq,
                                                            protein_seq, protein_info.strand)
    if start_codon_coords
      gff_records << Bio::GFF::GFF3::Record.new(
        protein_info.scaffold, "MSMS", "start_codon", start_codon_coords[0], start_codon_coords[1], '',
        protein_info.strand, nil, [["Parent", pep_id]]
      )
    end

    cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start, pep_genomic_end, peptide_seq,
                                                protein_seq, protein_info.strand)
    if cterm_coords
      gff_records << Bio::GFF::GFF3::Record.new(
        protein_info.scaffold, "MSMS", "cterm", cterm_coords[0], cterm_coords[1], '',
        protein_info.strand, nil, [["Parent", pep_id]]
      )
    end
  end

  gff_records
end
|
446
|
+
|
447
|
+
# Appends "signalpeptide" fragment records to gff_records when the peptide
# implies a cleaved N-terminal sequence.
#
# The inferred sequence comes from get_nterm_peptide_for_peptide; when one is
# found a tab-separated "Nterm" line is written to stdout, the "(cleaved)"
# marker is stripped, and the combined sequence is located on the genome with
# get_peptide_coordinates. One set of fragment records is added per location.
#
# gff_records is modified in place and also returned. The earlier version
# used `gff_records += ...`, which only rebound the method's local variable,
# so the caller's array never received the new records.
def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
  pep_id = "#{prot_id}.p#{peptide_count}"

  signal_peptide = get_nterm_peptide_for_peptide(peptide_seq, protein_seq)
  return gff_records unless signal_peptide

  $stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"

  # Get raw signal_peptide sequence
  raw_signal_peptide = signal_peptide.sub(/\(cleaved\)/, "")

  signal_peptide_coords = get_peptide_coordinates(protein_seq, raw_signal_peptide, protein_info, dna_sequence)
  return gff_records unless signal_peptide_coords

  signal_peptide_coords.each do |spcoords|
    gff_records.concat(
      generate_fragment_gffs_for_coords(spcoords, protein_info, pep_id, raw_signal_peptide, genomedb, "signalpeptide")
    )
  end

  gff_records
end
|
464
|
+
|
465
|
+
# Checks whether a peptide record repeats a placement that was already seen.
#
# peptide_gff             - record with #attributes (list of [tag, value]
#                           pairs including a "Name" entry) and #start
# peptides_covered_genome - lookup from peptide sequence to a start coordinate
#
# Returns true when the start stored for this peptide's sequence equals the
# record's start, false otherwise.
def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
  name_position = peptide_gff.attributes.index { |attribute| attribute[0] == "Name" }
  sequence = peptide_gff.attributes[name_position][1]
  peptides_covered_genome[sequence] == peptide_gff.start
end
|
473
|
+
|
474
|
+
end
|