protk 1.2.4 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,9 +17,10 @@ input_stager = nil
17
17
 
18
18
  # Setup specific command-line options for this tool. Other options are inherited from SearchTool
19
19
  #
20
- search_tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme,
20
+ search_tool=SearchTool.new([:background,:database,:explicit_output,:over_write,:enzyme,
21
21
  :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages])
22
22
 
23
+ search_tool.jobid_prefix="p"
23
24
  search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
24
25
  search_tool.options.output_suffix="_msgfplus"
25
26
 
@@ -135,7 +136,7 @@ ARGV.each do |filename|
135
136
  if ( search_tool.explicit_output!=nil)
136
137
  output_path=search_tool.explicit_output
137
138
  else
138
- output_path="#{search_tool.output_base_path(filename.chomp)}.pepXML"
139
+ output_path="#{search_tool.output_base_path(filename.chomp)}.pep.xml"
139
140
  end
140
141
 
141
142
 
@@ -232,20 +233,28 @@ ARGV.each do |filename|
232
233
  # As a final part of the command we convert to pepxml
233
234
  if search_tool.no_pepxml
234
235
  cmd << "; cp #{mzid_output_path} #{output_path}"
235
- elsif search_tool.explicit_output
236
+ else
237
+ #if search_tool.explicit_output
236
238
  cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
237
239
  #Then copy the pepxml to the final output path
238
- cmd << "; cp #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
240
+ cmd << "; mv #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
239
241
  end
240
242
 
241
243
 
242
244
  # Up to here we've formulated the command. The rest is cleanup
243
245
  p "Running:#{cmd}"
244
246
 
247
+ # In case the user specified background running we need to create a jobscript path
248
+ #
249
+ jobscript_path="#{output_path}.pbs.sh"
250
+
245
251
  # Run the search
246
252
  #
247
253
  job_params= {:jobid => search_tool.jobid_from_filename(filename) }
248
- search_tool.run(cmd,genv,job_params)
254
+ job_params[:queue]="seventytwo"
255
+ job_params[:vmem]="70gb"
256
+ code = search_tool.run(cmd,genv,job_params,jobscript_path)
257
+ throw "Command failed with exit code #{code}" unless code==0
249
258
 
250
259
  if for_galaxy
251
260
  input_stager.restore_references(output_path)
@@ -85,7 +85,12 @@ end
85
85
  prophet_tool.options.decoy_prefix="decoy"
86
86
  prophet_tool.option_parser.on( '--decoy-prefix prefix', 'Prefix for decoy sequences') do |prefix|
87
87
  prophet_tool.options.decoy_prefix = prefix
88
- end
88
+ end
89
+
90
+ prophet_tool.options.no_decoys = false
91
+ prophet_tool.option_parser.on( '--no-decoy', 'Don\'t use decoy sequences to pin down the negative distribution') do
92
+ prophet_tool.options.no_decoys = true
93
+ end
89
94
 
90
95
  prophet_tool.options.override_database=nil
91
96
  prophet_tool.option_parser.on( '--override-database database', 'Manually specify database') do |database|
@@ -207,12 +212,14 @@ def generate_command(genv,prophet_tool,inputs,output,database,engine)
207
212
  cmd << " -I2 -T3 -I4 -I5 -I6 -I7 "
208
213
  end
209
214
 
210
- if engine=="omssa" || engine=="phenyx"
211
- cmd << " -Op -P -d#{prophet_tool.decoy_prefix} "
212
- else
213
- cmd << " -d#{prophet_tool.decoy_prefix} "
214
- end
215
-
215
+ unless prophet_tool.no_decoys
216
+
217
+ if engine=="omssa" || engine=="phenyx"
218
+ cmd << " -Op -P -d#{prophet_tool.decoy_prefix} "
219
+ else
220
+ cmd << " -d#{prophet_tool.decoy_prefix} "
221
+ end
222
+ end
216
223
 
217
224
  if ( inputs.class==Array)
218
225
  cmd << " #{inputs.join(" ")}"
@@ -0,0 +1,624 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Original python version created by Max Grant
5
+ # Translated to ruby by Ira Cooke 29/1/2013
6
+ #
7
+ #
8
+
9
+ require 'protk/constants'
10
+ require 'protk/tool'
11
+ require 'protk/fastadb'
12
+ require 'protk/gapped_aligner'
13
+ require 'libxml'
14
+ require 'bio'
15
+
16
+ include LibXML
17
+
18
# Command-line setup for protxml_to_gff.rb.
# Only :explicit_output is requested from the shared Tool option set; the
# remaining options are declared here.
tool = Tool.new([:explicit_output])
tool.option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "

# ProtXML file holding the observed proteins/peptides (checked as required below).
tool.options.protxml = nil
tool.option_parser.on('-p filename', '--protxml filename', 'Observed Data (ProtXML Format)') do |file|
  tool.options.protxml = file
end

# Protein fasta database used for the searches (checked as required below).
tool.options.database = nil
tool.option_parser.on('-d filename', '--database filename', 'Database used for ms/ms searches (Fasta Format)') do |file|
  tool.options.database = file
end

# Nucleotide fasta for the scaffolds (not part of the required-option check).
tool.options.genome = nil
tool.option_parser.on('-g filename', '--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)') do |file|
  tool.options.genome = file
end

tool.options.skip_fasta_indexing = false
tool.option_parser.on('--skip-index', 'Don\'t index database (Index should already exist)') do
  tool.options.skip_fasta_indexing = true
end

tool.options.peptide_probability_threshold = 0.95
tool.option_parser.on('--threshold prob', 'Peptide Probability Threshold (Default 0.95)') do |thresh|
  tool.options.peptide_probability_threshold = thresh.to_f
end

exit unless tool.check_options [:protxml, :database]

# Output defaults to peptides.gff unless an explicit output path was given.
gff_out_file = tool.explicit_output.nil? ? "peptides.gff" : tool.explicit_output

# In-memory collection of GFF records. The output file is deliberately NOT
# opened here: it is opened when the records are written at the end of the
# script, so no unused handle is left open while the work is done.
gff_db = Bio::GFF.new()
56
+
57
+
58
# Parses the protXML file at +protxml_file+ and returns the result of an
# XPath search for every protein element in the protXML namespace.
def parse_proteins(protxml_file)
  puts "Parsing proteins from protxml"
  document = XML::Parser.file(protxml_file).parse
  document.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
end
65
+
66
# Returns a FastaDB for +database_path+.
#
# +database_path+ is used directly when it names an existing file; otherwise
# it is handed to Constants#current_database_for_name to obtain a filename.
# An existing "<file>.pin" is taken to mean the database is already indexed;
# if it is absent the database is indexed via FastaDB.create using +type+.
def prepare_fasta(database_path,type)
  candidate = Pathname.new(database_path)
  db_filename = if candidate.exist? # It's an explicitly named db
    candidate.realpath.to_s
  else
    Constants.new.current_database_for_name(database_path)
  end

  if File.exist?("#{db_filename}.pin")
    puts "Using existing indexed database"
    FastaDB.new(db_filename)
  else
    puts "Indexing database"
    FastaDB.create(db_filename,db_filename,type)
  end
end
86
+
87
# Returns the protein_name attribute of +protein_node+ followed by the
# protein_name of each of its indistinguishable_protein children.
def protein_names(protein_node)
  names = [protein_node['protein_name']]
  alternatives = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
  alternatives.each { |alternative| names << alternative['protein_name'] }
  names
end
95
+
96
# Returns the result of searching +protein_node+ for its direct peptide
# children in the protXML namespace.
def peptide_nodes(protein_node)
  xpath = 'protxml:peptide'
  namespace = 'protxml:http://regis-web.systemsbiology.net/protXML'
  protein_node.find(xpath,namespace)
end
99
+
100
+
101
# Looks up +protein_name+ in +fastadb+ (via get_by_id) and returns the entry.
# Prints a message and raises KeyError when the lookup returns nil.
def get_fasta_record(protein_name,fastadb)
  record = fastadb.get_by_id(protein_name)
  return record unless record == nil

  puts "Failed lookup for #{protein_name}"
  raise KeyError
end
110
+
111
# Plain record describing where a database protein entry sits on the genome.
# All fields are filled in by the code that parses the fasta header.
class CDSInfo
  attr_accessor :fasta_id
  attr_accessor :strand
  attr_accessor :frame
  attr_accessor :name
  attr_accessor :scaffold
  attr_accessor :start
  attr_accessor :end
  attr_accessor :coding_sequences
  attr_accessor :is_sixframe
  attr_accessor :gene_id

  # Returns true when +candidate_entry+ lies on the same scaffold and strand
  # as this entry and their start..end intervals intersect. Intervals that
  # merely touch (one starts where the other ends) are not counted as
  # overlapping.
  def overlap(candidate_entry)
    return false if candidate_entry.scaffold != scaffold
    # Compare against the candidate's strand; comparing our own strand with
    # itself can never be unequal.
    return false if candidate_entry.strand != strand
    # Candidate starts at or after our end: it lies entirely to the right.
    return false if candidate_entry.start >= self.end
    # We start at or after the candidate's end: it lies entirely to the left.
    return false if start >= candidate_entry.end

    true
  end
end
132
+
133
# Builds a CDSInfo from a fasta entry.
#
# The entry description is read as space-separated "start|end" pairs: the
# first pair is the overall start/end, any further pairs are stored as
# coding_sequences. scaffold, name, frame and gene_id are extracted from
# entry_id with regular expressions. An entry_id containing "frame" is
# treated as a six-frame translation (strand '-' for frames above 3);
# otherwise the strand is '-' when the name contains "rev".
#
# Raises EncodingError when the description has no coordinates or when a
# required part of entry_id cannot be parsed, so that the caller's
# `rescue KeyError,EncodingError` can skip the entry instead of the script
# dying with NoMethodError on a nil match.
def cds_info_from_fasta(fasta_entry)
  entry_id = fasta_entry.entry_id

  # First capture group of +pattern+ in +text+, or EncodingError if absent.
  capture = lambda do |text, pattern, what|
    match = text.match(pattern)
    raise EncodingError, "Unable to parse #{what} from #{entry_id}" if match.nil?
    match[1]
  end

  info = CDSInfo.new
  info.fasta_id = fasta_entry

  positions = fasta_entry.identifiers.description.to_s.split(' ').collect do |coords|
    coords.split('|').collect { |pos| pos.to_i }
  end
  raise EncodingError, "No coordinates in description of #{entry_id}" if positions.empty?

  info.coding_sequences = positions.length > 1 ? positions[1..-1] : []
  info.start = positions[0][0]
  info.end = positions[0][1]

  info.scaffold = capture.call(entry_id, /(scaffold_?\d+)_/, "scaffold")
  info.name = capture.call(entry_id, /lcl\|(.*)/, "name")

  if entry_id =~ /frame/
    info.frame = capture.call(info.name, /frame_(\d)/, "frame")
    info.strand = (info.frame.to_i > 3) ? '-' : '+'
    info.is_sixframe = true
  else
    info.strand = (info.name =~ /rev/) ? '-' : '+'
    info.gene_id = capture.call(info.name, /_\w{3}_(.*)\.t/, "gene id")
    info.is_sixframe = false
  end
  info
end
162
+
163
+
164
# Returns true when +candidate_entry+ does not duplicate anything in
# +existing_entries+, i.e. no existing entry shares its gene_id and none
# overlaps it (per the entries' own #overlap).
#
# gene_ids are only compared when the candidate actually has one. Entries
# without a gene_id (it is left nil for six-frame entries) would otherwise
# all compare equal through nil == nil and every entry after the first would
# be rejected as redundant.
def is_new_genome_location(candidate_entry,existing_entries)
  candidate_gene = candidate_entry.gene_id

  existing_entries.none? do |existing|
    same_gene = !candidate_gene.nil? && existing.gene_id == candidate_gene
    same_gene || existing.overlap(candidate_entry)
  end
end
181
+
182
+
183
# Builds the GFF3 "protein" record for an entry: located on the entry's
# scaffold/start/end/strand, scored with +prot_prob+, and carrying
# ID=+prot_id+ and Name=entry name attributes. +protein_name+ is accepted
# but not used.
def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
  attributes = [["ID",prot_id],["Name",entry_info.name]]
  Bio::GFF::GFF3::Record.new(
    entry_info.scaffold,
    "MSMS",
    "protein",
    entry_info.start,
    entry_info.end,
    prot_prob,
    entry_info.strand,
    nil,
    attributes
  )
end
190
+
191
# Returns the stretch of scaffold sequence covered by +protein_info+,
# reverse-complemented when the entry is on the '-' strand.
# The slice runs from index start-1 through index end of the scaffold string.
def get_dna_sequence(protein_info,genomedb)
  scaffold_record = get_fasta_record(protein_info.scaffold,genomedb)
  first_index = protein_info.start-1
  region = scaffold_record.naseq.to_s[first_index..protein_info.end]

  return region unless protein_info.strand == "-"

  Bio::Sequence::NA.new(region).reverse_complement
end
202
+
203
# True when +pep_seq+ occurs in the translation of +gene_seq+ in any of the
# frames 1 through 6; false otherwise.
def peptide_is_in_sixframe(pep_seq,gene_seq)
  nucleotides = Bio::Sequence::NA.new(gene_seq)
  (1..6).any? { |frame| nucleotides.translate(frame).index(pep_seq) }
end
212
+
213
+ # gene_seq should already have been reverse_complemented if on reverse strand
214
# Locates a peptide on the genome by gapped alignment against +gene_seq+.
#
# Returns nil when the peptide is found in a plain six-frame translation of
# +gene_seq+. Otherwise aligns with GappedAligner and returns a one-element
# array holding a flat list [start1,end1,start2,end2,...] of genomic
# coordinates for the aligned fragments. Fragment offsets are added to
# protein_info.start on the '+' strand and subtracted from protein_info.end
# otherwise. +prot_seq+ is accepted but not used.
#
# An alignment whose gap count is not exactly one is reported with a warning
# and still converted; the interactive debugger breakpoint that used to sit
# here has been removed so batch runs do not stop (or fail to load the
# debugger library).
def get_peptide_coordinates_by_alignment(prot_seq,pep_seq,protein_info,gene_seq)
  return nil if peptide_is_in_sixframe(pep_seq,gene_seq)

  puts "Warning. Actually found a gap #{protein_info.fasta_id}"
  aln = GappedAligner.new().align(pep_seq,gene_seq)
  puts "More than one intron.#{aln}" unless aln.gaps.length==1

  forward = protein_info.strand=='+'
  frags = aln.fragments
  frags = frags.reverse if protein_info.strand=='-'

  pep_coords = []
  frags.each do |frag|
    if forward
      pep_coords << protein_info.start + frag[0]
      pep_coords << protein_info.start + frag[1]
    else
      pep_coords << protein_info.end - frag[1]
      pep_coords << protein_info.end - frag[0]
    end
  end

  [pep_coords]
end
243
+
244
# Converts a peptide's position within a protein into genomic fragments,
# walking the coding sequences (exons) in ascending start order.
#
# pepstart/pepend  - peptide start/end as residue offsets within the protein
# gene_start       - genomic coordinate the offsets are relative to
# gene_end         - accepted but not used
# coding_sequences - array of [start,end] pairs
#
# Returns a one-element array containing the list of [start,end] fragments:
# the tail of the exon in which the peptide starts, any whole exons in
# between, and the head of the exon in which it ends. The first fragment
# always runs to the end of its exon.
# NOTE(review): presumably callers only use this for peptides that cross at
# least one intron - confirm.
#
# Raises EncodingError when the coding sequences run out before the peptide
# has been placed. (This used to drop into a debugger and then fail on nil.)
def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
  sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }

  # Assume positive strand
  pi_end = pepend*3+gene_start-1
  p_i = pepstart*3+gene_start-1 # Initially we are looking for the first fragment
  finding_start = true
  fragments = []

  sorted_cds.each_with_index do |(cds_start, cds_end), i|
    if cds_end >= p_i && !finding_start
      # A terminal exon: the peptide ends inside it
      fragments << [cds_start,p_i]
      break
    end

    # Every remaining case needs the following exon to measure the intron.
    next_coords = sorted_cds[i+1]
    if next_coords.nil?
      raise EncodingError, "Peptide at #{pepstart}-#{pepend} does not fit the coding sequences #{sorted_cds.inspect}"
    end
    intron_offset = (next_coords[0]-cds_end)-1

    if cds_end < p_i
      # Exon lies wholly before the index we are looking for; shift both
      # indices past the intron that follows it.
      p_i += intron_offset
      pi_end += intron_offset
      fragments << [cds_start,cds_end] unless finding_start # a middle exon
    else
      # Exon containing the peptide start; from here on we look for the end.
      fragments << [p_i+1,cds_end]
      pi_end += intron_offset
      p_i = pi_end
      finding_start = false
    end
  end

  [fragments]
end
290
+
291
+ # gene_seq should already have been reverse_complemented if on reverse strand
292
# Maps a peptide of a predicted transcript to genomic fragments using the
# transcript's coding-sequence coordinates.
#
# Returns nil when the peptide already appears in a six-frame translation of
# +gene_seq+, or (after printing a warning) when the peptide cannot be found
# in +prot_seq+. Otherwise returns the result of
# fragment_coords_from_protein_coords.
def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
  return nil if peptide_is_in_sixframe(pep_seq,gene_seq)

  on_reverse = protein_info.strand=='-'
  pep_start_i = on_reverse ? prot_seq.reverse.index(pep_seq.reverse) : prot_seq.index(pep_seq)

  if pep_start_i==nil
    puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
    return nil
  end

  # Plus 1 because on the reverse strand the stop codon will be at the
  # beginning of the sequence (when read forwards). Need to eliminate it.
  pep_start_i += 1 if on_reverse
  pep_end_i = pep_start_i+pep_seq.length

  fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
end
320
+
321
# Finds every non-overlapping occurrence of +pep_seq+ in +prot_seq+ and
# converts each to genomic coordinates relative to protein_info.start
# (three bases per residue). For '-' strand entries both sequences are
# reversed before searching.
#
# Returns an array with one element per occurrence, each of the form
# [[genomic_start, genomic_end]]; empty when there is no occurrence or the
# peptide is empty.
#
# The search resumes after the end of the previous hit. Restarting it at the
# previous hit's own index (as before) re-found the same occurrence, so
# repeated peptides all reported the first location. The peptide is matched
# as a literal string rather than being interpolated into a regex.
def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
  # An empty needle matches everywhere and would never advance the search.
  return [] if pep_seq.empty?

  if protein_info.strand == '-'
    prot_seq = prot_seq.reverse
    pep_seq = pep_seq.reverse
  end

  start_indexes = []
  search_from = 0
  while (found = prot_seq.index(pep_seq,search_from))
    start_indexes << found
    search_from = found + pep_seq.length
  end

  start_indexes.collect do |si|
    pep_genomic_start = protein_info.start + 3*si
    pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
    [[pep_genomic_start,pep_genomic_end]]
  end
end
342
+
343
+ # Returns a 4-mer [genomic_start,fragment1_end(or0),frag2_start(or0),genomic_end]
344
# Dispatches to the six-frame lookup when the entry is a six-frame
# translation, and to the transcript-based lookup (which also needs
# +gene_seq+) otherwise. Returns whatever the chosen lookup returns.
def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
  return get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info) if protein_info.is_sixframe

  get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
end
351
+
352
+
353
# Builds one GFF3 record per [start,end] pair in +coords+.
#
# Each record is a child of +pep_id+, has feature type +name+, and carries
# the translated fragment as its Name attribute ("*" when translation
# raises). The record ID is "<pep_id>.fg" when +name+ is "CDS" and
# "<pep_id>.sp" otherwise. The phase of each fragment is derived from the
# bases left over by the previous one; for entries not on the '+' strand the
# fragments are processed in reverse order.
#
# After building the records, the fragments are concatenated and translated
# as a sanity check; a mismatch with +peptide_seq+ is reported on stdout.
# (The debugger breakpoints that used to sit here have been removed.)
#
# Returns the array of records (empty when +coords+ is empty).
def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
  scaff = get_fasta_record(protein_info.scaffold,genomedb)
  scaffold_seq = Bio::Sequence::NA.new(scaff.seq)
  on_reverse = protein_info.strand=='-'

  fragment_phase = 0
  ordered_coords = protein_info.strand=='+' ? coords : coords.reverse
  frag_id = name=="CDS" ? "#{pep_id}.fg" : "#{pep_id}.sp"

  gff_lines = ordered_coords.collect do |frag_start,frag_end|
    frag_naseq = scaffold_seq[frag_start-1..frag_end-1]

    begin
      frag_frame = fragment_phase+1
      oriented = on_reverse ? frag_naseq.reverse_complement : frag_naseq
      frag_seq = oriented.translate(frag_frame)
    rescue StandardError
      puts "Unable to translate #{frag_naseq}" if frag_naseq.length > 1
      frag_seq = "*"
    end

    fragment_record = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
      name,frag_start,frag_end,'',
      protein_info.strand,fragment_phase,[["Parent",pep_id],["ID",frag_id],["Name",frag_seq]])

    # Phase for the next fragment: bases needed to complete the codon that
    # this fragment leaves unfinished.
    remainder = (frag_naseq.length-fragment_phase) % 3
    fragment_phase = (3-remainder) % 3

    fragment_record
  end

  # Nothing to cross-check without fragments (concatenation would be nil).
  return gff_lines if coords.empty?

  concat_seq = nil
  coords.each do |frag_start,frag_end|
    frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
    concat_seq = concat_seq.nil? ? frag_naseq : concat_seq + frag_naseq
  end

  check_seq = on_reverse ? concat_seq.reverse_complement.translate : concat_seq.translate
  puts "Fragment seqs not equal to peptide seqs" if check_seq != peptide_seq

  gff_lines
end
412
+
413
# Returns [first,last] coordinates for a start codon implied by the peptide,
# or nil when the peptide gives no such evidence.
#
# Coordinates are returned only when the peptide begins with 'M' and either
# sits at the very start of +protein_seq+ or is preceded by a residue other
# than K or R. The codon begins at +peptide_genomic_start+ on the '+' strand
# and at +peptide_genomic_end+ - 1 otherwise.
# NOTE(review): the non-'+' branch yields [end-1, end+1]; confirm this
# offset against the coordinate convention used for peptide ends.
#
# Returns nil (instead of raising TypeError) when +peptide_seq+ does not
# occur in +protein_seq+.
def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  pi = protein_seq.index(peptide_seq)
  return nil if pi.nil?
  return nil unless protein_seq[pi]=='M'
  # Preceded by K or R: no start-codon evidence is reported.
  return nil if pi>0 && ['K','R'].include?(protein_seq[pi-1])

  start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-1
  [start_codon_coord,start_codon_coord+2]
end
431
+
432
# Returns [first,last] coordinates for the peptide's final codon when the
# peptide does not end in K or R, and nil when it does.
# The codon begins at +peptide_genomic_end+ - 3 on the '+' strand and at
# +peptide_genomic_start+ + 1 otherwise. +protein_seq+ is accepted but not
# used.
def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  last_residue = peptide_seq[-1]
  return nil if last_residue=='K' || last_residue=='R'

  codon_coord = strand=='+' ? peptide_genomic_end-3 : peptide_genomic_start+1
  [codon_coord,codon_coord+2]
end
443
+
444
+
445
# Returns the stretch of +protein_seq+ from the nearest 'M' before the
# peptide up to (not including) the peptide itself, or nil.
#
# A result is only produced when the peptide is not at the start of the
# protein, is not preceded by K or R, and does not itself begin with 'M'.
# Returns nil with a message when no 'M' precedes the peptide, and nil
# (instead of raising NoMethodError) when +peptide_seq+ does not occur in
# +protein_seq+.
def get_signal_peptide_for_peptide(peptide_seq,protein_seq)
  pi = protein_seq.index(peptide_seq)
  return nil if pi.nil? || pi.zero?
  return nil if ['K','R'].include?(protein_seq[pi-1]) || protein_seq[pi]=='M'

  # Searching the reversed leader finds the methionine closest to the peptide.
  distance_to_met = protein_seq[0..pi].reverse.index('M')
  if distance_to_met.nil?
    puts "No methionine found ahead of peptide sequence. Unable to determine signal peptide sequence"
    return nil
  end

  protein_seq[(pi-distance_to_met)..(pi-1)]
end
463
+
464
# Produces all GFF3 records for one peptide observed in one protein entry.
#
# For every genomic location returned by get_peptide_coordinates this emits:
#   * a "peptide" record (ID "<prot_id>.p<peptide_count>", Parent +prot_id+),
#   * its "CDS" fragment records,
#   * a "start_codon" record when get_start_codon_coords_for_peptide returns
#     coordinates,
#   * a "cterm" record when get_cterm_coords_for_peptide returns coordinates,
#   * "signalpeptide" fragment records when a signal peptide is found and
#     can be located.
#
# Returns [] when get_peptide_coordinates returns nil, which the original
# code comments describe as a predicted transcript already covered by the
# six-frame entries.
# Raises RuntimeError when the entry is not six-frame and no +genomedb+ was
# supplied.
#
# Fixes relative to the previous version: the cterm branch now appends the
# cterm record (it used to append the start-codon record, or nil);
# single-attribute records pass attributes as a list of [tag,value] pairs
# like every other record here; the error is raised with `raise` rather
# than `throw`; and the records are no longer dumped to stdout on every call.
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,genomedb=nil)
  dna_sequence = nil
  unless protein_info.is_sixframe
    raise "A genome is required if predicted transcripts are to be mapped" if genomedb.nil?
    dna_sequence = get_dna_sequence(protein_info,genomedb)
  end

  peptide_coords = get_peptide_coordinates(protein_seq,peptide_seq,protein_info,dna_sequence)
  return [] if peptide_coords.nil?

  gff_records = []

  # Convert each match to genome coordinates and create its gff lines
  peptide_coords.each do |coords|
    pep_genomic_start = coords.first[0]
    pep_genomic_end = coords.last[1]

    pep_id = "#{prot_id}.p#{peptide_count}"
    pep_attributes = [["ID",pep_id],["Parent",prot_id],["Name",peptide_seq]]

    pep_gff_line = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
      "peptide",pep_genomic_start,pep_genomic_end,peptide_prob,
      protein_info.strand,nil,pep_attributes)

    # For standard peptides
    frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
    gff_records += [pep_gff_line] + frag_gffs

    # For peptides with only 1 tryptic terminus
    start_codon_coords = get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if start_codon_coords
      gff_records << Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
        "start_codon",start_codon_coords[0],start_codon_coords[1],'',
        protein_info.strand,nil,[["Parent",pep_id]])
    end

    cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if cterm_coords
      gff_records << Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
        "cterm",cterm_coords[0],cterm_coords[1],'',
        protein_info.strand,nil,[["Parent",pep_id]])
    end

    signal_peptide = get_signal_peptide_for_peptide(peptide_seq,protein_seq)
    next unless signal_peptide

    signal_peptide_coords = get_peptide_coordinates(protein_seq,signal_peptide,protein_info,dna_sequence)
    next unless signal_peptide_coords

    signal_peptide_coords.each do |spcoords|
      gff_records += generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,signal_peptide,genomedb,"signalpeptide")
    end
  end

  gff_records
end
539
+
540
# Main script: read proteins from the protXML, map their peptides to the
# genome and write everything out as GFF.
proteins = parse_proteins(tool.protxml)
fastadb = prepare_fasta(tool.database,'prot')
genomedb = tool.genome ? prepare_fasta(tool.genome,'nucl') : nil

puts "Aligning peptides and writing GFF data..."

low_prob = 0 # never incremented below; kept because the summary reports it
skipped = 0
peptide_count = 0
protein_count = 0
total_peptides = 0

proteins.each do |prot|
  prot_prob = prot['probability']
  # Proteins are filtered with the same threshold as peptides.
  next if prot_prob.to_f < tool.peptide_probability_threshold

  # Gets identifiers of all proteins (including indistinguishable ones)
  prot_names = protein_names(prot)

  peptides = peptide_nodes(prot)
  entries_covered = []
  prot_names.each do |protein_name|
    protein_count += 1
    prot_id = "pr#{protein_count}"
    begin
      protein_fasta_entry = get_fasta_record(protein_name,fastadb)
      protein_info = cds_info_from_fasta(protein_fasta_entry)

      if is_new_genome_location(protein_info,entries_covered)
        # Pass prot_id (not the bare count) so the protein's ID attribute
        # matches the Parent attribute written on its peptides.
        protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,prot_id)

        gff_db.records += ["##gff-version 3\n","##sequence-region #{protein_info.scaffold} 1 160\n",protein_gff]

        prot_seq = protein_fasta_entry.aaseq.to_s
        raise "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s

        peptide_count = 1
        peptides.each do |peptide|
          pprob = peptide['nsp_adjusted_probability'].to_f
          next unless pprob >= tool.peptide_probability_threshold

          total_peptides += 1
          pep_seq = peptide['peptide_sequence']

          gff_db.records += generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,genomedb)
          peptide_count += 1
        end
      else
        puts "Skipping redundant entry #{protein_name}"
        protein_count -= 1 # To counter +1 prior to begin rescue end block
      end

      entries_covered << protein_info
    rescue KeyError,EncodingError
      # Entry could not be looked up or its header could not be parsed.
      skipped += 1
    end
  end
end

# The block form closes the file even if writing a record raises.
File.open(gff_out_file,'w+') do |f|
  gff_db.records.each { |rec| f.write(rec.to_s) }
end

p "Finished."
p "Proteins: #{protein_count}"
p "Skipped Decoys: #{skipped}"
p "Total Peptides: #{total_peptides}"
p "Peptides Written: #{total_peptides - low_prob}"
p "Peptides Culled: #{low_prob}"
exit(0)