protk 1.2.4 → 1.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -17,9 +17,10 @@ input_stager = nil
17
17
 
18
18
  # Setup specific command-line options for this tool. Other options are inherited from SearchTool
19
19
  #
20
- search_tool=SearchTool.new([:database,:explicit_output,:over_write,:enzyme,
20
+ search_tool=SearchTool.new([:background,:database,:explicit_output,:over_write,:enzyme,
21
21
  :modifications,:instrument,:mass_tolerance_units,:mass_tolerance,:missed_cleavages])
22
22
 
23
+ search_tool.jobid_prefix="p"
23
24
  search_tool.option_parser.banner = "Run an MSGFPlus msms search on a set of msms spectrum input files.\n\nUsage: msgfplus_search.rb [options] file1.mzML file2.mzML ..."
24
25
  search_tool.options.output_suffix="_msgfplus"
25
26
 
@@ -135,7 +136,7 @@ ARGV.each do |filename|
135
136
  if ( search_tool.explicit_output!=nil)
136
137
  output_path=search_tool.explicit_output
137
138
  else
138
- output_path="#{search_tool.output_base_path(filename.chomp)}.pepXML"
139
+ output_path="#{search_tool.output_base_path(filename.chomp)}.pep.xml"
139
140
  end
140
141
 
141
142
 
@@ -232,20 +233,28 @@ ARGV.each do |filename|
232
233
  # As a final part of the command we convert to pepxml
233
234
  if search_tool.no_pepxml
234
235
  cmd << "; cp #{mzid_output_path} #{output_path}"
235
- elsif search_tool.explicit_output
236
+ else
237
+ #if search_tool.explicit_output
236
238
  cmd << "; #{genv.idconvert} #{mzid_output_path} --pepXML -o #{Pathname.new(mzid_output_path).dirname}"
237
239
  #Then copy the pepxml to the final output path
238
- cmd << "; cp #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
240
+ cmd << "; mv #{mzid_output_path.chomp('.mzid')}.pepXML #{output_path}"
239
241
  end
240
242
 
241
243
 
242
244
  # Up to here we've formulated the command. The rest is cleanup
243
245
  p "Running:#{cmd}"
244
246
 
247
+ # In case the user specified background running we need to create a jobscript path
248
+ #
249
+ jobscript_path="#{output_path}.pbs.sh"
250
+
245
251
  # Run the search
246
252
  #
247
253
  job_params= {:jobid => search_tool.jobid_from_filename(filename) }
248
- search_tool.run(cmd,genv,job_params)
254
+ job_params[:queue]="seventytwo"
255
+ job_params[:vmem]="70gb"
256
+ code = search_tool.run(cmd,genv,job_params,jobscript_path)
257
+ throw "Command failed with exit code #{code}" unless code==0
249
258
 
250
259
  if for_galaxy
251
260
  input_stager.restore_references(output_path)
@@ -85,7 +85,12 @@ end
85
85
  prophet_tool.options.decoy_prefix="decoy"
86
86
  prophet_tool.option_parser.on( '--decoy-prefix prefix', 'Prefix for decoy sequences') do |prefix|
87
87
  prophet_tool.options.decoy_prefix = prefix
88
- end
88
+ end
89
+
90
+ prophet_tool.options.no_decoys = false
91
+ prophet_tool.option_parser.on( '--no-decoy', 'Don\'t use decoy sequences to pin down the negative distribution') do
92
+ prophet_tool.options.no_decoys = true
93
+ end
89
94
 
90
95
  prophet_tool.options.override_database=nil
91
96
  prophet_tool.option_parser.on( '--override-database database', 'Manually specify database') do |database|
@@ -207,12 +212,14 @@ def generate_command(genv,prophet_tool,inputs,output,database,engine)
207
212
  cmd << " -I2 -T3 -I4 -I5 -I6 -I7 "
208
213
  end
209
214
 
210
- if engine=="omssa" || engine=="phenyx"
211
- cmd << " -Op -P -d#{prophet_tool.decoy_prefix} "
212
- else
213
- cmd << " -d#{prophet_tool.decoy_prefix} "
214
- end
215
-
215
+ unless prophet_tool.no_decoys
216
+
217
+ if engine=="omssa" || engine=="phenyx"
218
+ cmd << " -Op -P -d#{prophet_tool.decoy_prefix} "
219
+ else
220
+ cmd << " -d#{prophet_tool.decoy_prefix} "
221
+ end
222
+ end
216
223
 
217
224
  if ( inputs.class==Array)
218
225
  cmd << " #{inputs.join(" ")}"
@@ -0,0 +1,624 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # This file is part of protk
4
+ # Original python version created by Max Grant
5
+ # Translated to ruby by Ira Cooke 29/1/2013
6
+ #
7
+ #
8
+
9
+ require 'protk/constants'
10
+ require 'protk/tool'
11
+ require 'protk/fastadb'
12
+ require 'protk/gapped_aligner'
13
+ require 'libxml'
14
+ require 'bio'
15
+
16
+ include LibXML
17
+
18
# Command-line setup for protxml_to_gff.rb.
# Only :explicit_output is inherited from Tool; the remaining options are declared here.
tool=Tool.new([:explicit_output])
tool.option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "

# ProtXML file holding the observed proteins and peptides (required)
tool.options.protxml=nil
tool.option_parser.on( '-p filename','--protxml filename', 'Observed Data (ProtXML Format)' ) do |file|
  tool.options.protxml=file
end

# Protein database used for the ms/ms searches (required)
tool.options.database=nil
tool.option_parser.on( '-d filename','--database filename', 'Database used for ms/ms searches (Fasta Format)' ) do |file|
  tool.options.database=file
end

# Nucleotide sequences of the scaffolds (optional)
tool.options.genome=nil
tool.option_parser.on( '-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)' ) do |file|
  tool.options.genome=file
end

tool.options.skip_fasta_indexing=false
tool.option_parser.on('--skip-index','Don\'t index database (Index should already exist)') do
  tool.options.skip_fasta_indexing=true
end

tool.options.peptide_probability_threshold=0.95
tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
  tool.options.peptide_probability_threshold=thresh.to_f
end

exit unless tool.check_options [:protxml,:database]

# Write to "peptides.gff" unless the user named an explicit output file
gff_out_file = tool.explicit_output.nil? ? "peptides.gff" : tool.explicit_output

# Accumulates every record before the file is written at the end of the script.
# The output file is deliberately not opened here: it is opened once, when the
# records are written, so no unused handle is left dangling during the run.
gff_db = Bio::GFF.new()
56
+
57
+
58
# Parses the given ProtXML file and returns the result of an XPath query for
# every protxml:protein element in the document.
def parse_proteins(protxml_file)
  puts "Parsing proteins from protxml"
  document = XML::Parser.file(protxml_file).parse
  document.find('.//protxml:protein','protxml:http://regis-web.systemsbiology.net/protXML')
end
65
+
66
# Resolves database_path to a fasta file and returns a FastaDB for it.
#
# database_path - either a path to an existing file, or a name that is handed
#                 to Constants#current_database_for_name.
# type          - passed through to FastaDB.create when an index must be built.
#
# An existing "<db>.pin" file is taken to mean the database is already indexed.
def prepare_fasta(database_path,type)
  candidate = Pathname.new(database_path)
  db_filename = if candidate.exist?
    # It's an explicitly named db
    candidate.realpath.to_s
  else
    Constants.new.current_database_for_name(database_path)
  end

  if File.exist?("#{db_filename}.pin")
    puts "Using existing indexed database"
    FastaDB.new(db_filename)
  else
    puts "Indexing database"
    FastaDB.create(db_filename,db_filename,type)
  end
end
86
+
87
# Returns the protein_name of the given protein node followed by the
# protein_name of each of its indistinguishable_protein children.
def protein_names(protein_node)
  alternates = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
  names = [protein_node['protein_name']]
  alternates.each { |alternate| names << alternate['protein_name'] }
  names
end
95
+
96
# Returns the protxml:peptide children of the given protein node.
def peptide_nodes(protein_node)
  xpath = 'protxml:peptide'
  namespace = 'protxml:http://regis-web.systemsbiology.net/protXML'
  protein_node.find(xpath,namespace)
end
99
+
100
+
101
# Looks up protein_name in fastadb and returns the entry.
# Prints a message and raises KeyError when the lookup yields nil.
def get_fasta_record(protein_name,fastadb)
  record = fastadb.get_by_id protein_name
  return record unless record.nil?

  puts "Failed lookup for #{protein_name}"
  raise KeyError
end
110
+
111
# Genomic location and naming information for one database entry.
# Instances are filled in attribute by attribute by cds_info_from_fasta.
class CDSInfo
  attr_accessor :fasta_id
  attr_accessor :strand
  attr_accessor :frame
  attr_accessor :name
  attr_accessor :scaffold
  attr_accessor :start
  attr_accessor :end
  attr_accessor :coding_sequences
  attr_accessor :is_sixframe
  attr_accessor :gene_id

  # Returns true when candidate_entry lies on the same scaffold and strand as
  # this entry and their start..end intervals intersect. Intervals that merely
  # touch at an endpoint are not treated as overlapping.
  def overlap(candidate_entry)
    return false if candidate_entry.scaffold!=self.scaffold
    # Compare against the candidate's strand (comparing self with self can never differ)
    return false if candidate_entry.strand!=self.strand
    # Candidate starts at or after our end: it lies entirely downstream
    return false if candidate_entry.start >= self.end
    # Candidate ends at or before our start: it lies entirely upstream
    return false if candidate_entry.end <= self.start
    return true
  end

end
132
+
133
# Builds a CDSInfo from a fasta entry.
#
# The entry description is read as space separated "start|end" pairs: the first
# pair gives the overall start/end and any further pairs are stored as
# coding_sequences. Scaffold and name are extracted from the entry id. Ids
# containing "frame" are treated as six-frame translations (frames above 3 are
# on the '-' strand); all other ids take their strand from the presence of
# "rev" in the name and additionally get a gene_id.
#
# Raises EncodingError when the description holds no coordinates or the id
# cannot be parsed, so callers that already skip entries on EncodingError also
# skip these instead of failing with NoMethodError on a nil match.
def cds_info_from_fasta(fasta_entry)
  info=CDSInfo.new
  info.fasta_id=fasta_entry

  positions = fasta_entry.identifiers.description.to_s.split(' ').collect { |coords| coords.split('|').collect {|pos| pos.to_i} }
  raise EncodingError, "No coordinates in description of #{fasta_entry.entry_id}" if positions.empty?

  info.coding_sequences = positions.length > 1 ? positions[1..-1] : []
  info.start = positions[0][0]
  info.end = positions[0][1]

  entry_id = fasta_entry.entry_id
  info.scaffold = entry_id[/(scaffold_?\d+)_/,1]
  raise EncodingError, "No scaffold in id #{entry_id}" if info.scaffold.nil?
  info.name = entry_id[/lcl\|(.*)/,1]
  raise EncodingError, "No lcl| name in id #{entry_id}" if info.name.nil?

  if entry_id =~ /frame/
    info.frame = info.name[/frame_(\d)/,1]
    raise EncodingError, "No frame number in id #{entry_id}" if info.frame.nil?
    info.strand = (info.frame.to_i > 3) ? '-' : '+'
    info.is_sixframe = true
  else
    info.strand = (info.name =~ /rev/) ? '-' : '+'
    info.gene_id = info.name[/_\w{3}_(.*)\.t/,1]
    raise EncodingError, "No gene id in id #{entry_id}" if info.gene_id.nil?
    info.is_sixframe = false
  end
  info
end
162
+
163
+
164
# Returns true when candidate_entry does not duplicate any of existing_entries.
#
# An existing entry is a duplicate when it has the same (non-nil) gene_id as
# the candidate, or when existing.overlap(candidate_entry) is true. Entries
# whose gene_id was never assigned are compared by overlap only; otherwise two
# nil ids would compare equal and every such entry after the first would be
# rejected.
def is_new_genome_location(candidate_entry,existing_entries)
  candidate_gene = candidate_entry.gene_id

  existing_entries.none? do |existing|
    same_gene = !candidate_gene.nil? && existing.gene_id==candidate_gene
    same_gene || existing.overlap(candidate_entry)
  end
end
181
+
182
+
183
# Builds the GFF3 "protein" record for one database entry.
#
# protein_name - not used by this method.
# entry_info   - supplies name, scaffold, start, end and strand.
# prot_prob    - stored as the record score.
# prot_id      - stored as the record's ID attribute.
def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
  id_and_name = [["ID",prot_id],["Name",entry_info.name]]
  Bio::GFF::GFF3::Record.new(
    entry_info.scaffold,   # seqid
    "MSMS",                # source
    "protein",             # feature type
    entry_info.start,
    entry_info.end,
    prot_prob,             # score
    entry_info.strand,
    nil,                   # frame
    id_and_name
  )
end
190
+
191
# Returns the nucleotide sequence of the region described by protein_info.
#
# The scaffold record is fetched from genomedb and sliced from index
# (start-1) through index end. For entries on the '-' strand the slice is
# returned reverse complemented (as a Bio::Sequence::NA); otherwise the plain
# slice is returned.
def get_dna_sequence(protein_info,genomedb)
  scaffold_record = get_fasta_record(protein_info.scaffold,genomedb)
  region = scaffold_record.naseq.to_s[(protein_info.start-1)..protein_info.end]
  return region unless protein_info.strand == "-"

  Bio::Sequence::NA.new(region).reverse_complement
end
202
+
203
# Returns true if pep_seq occurs in the translation of gene_seq in any of the
# reading frames 1 to 6, false otherwise.
def peptide_is_in_sixframe(pep_seq,gene_seq)
  nucleotides = Bio::Sequence::NA.new(gene_seq)
  1.upto(6).any? { |reading_frame| nucleotides.translate(reading_frame).index(pep_seq) }
end
212
+
213
# Locates a peptide on the genome by gapped alignment against the gene sequence.
#
# gene_seq should already have been reverse_complemented if on reverse strand.
#
# Returns nil when the peptide is found in a six-frame translation of
# gene_seq. Otherwise returns a one element array holding a flat list of
# genomic start/end values, two per aligned fragment, computed relative to
# protein_info.start ('+' strand) or protein_info.end (any other strand).
#
# An alignment whose gap count is not exactly one is reported with a warning
# and processed anyway; it no longer drops into an interactive debugger.
def get_peptide_coordinates_by_alignment(prot_seq,pep_seq,protein_info,gene_seq)
  return nil if peptide_is_in_sixframe(pep_seq,gene_seq)

  puts "Warning. Actually found a gap #{protein_info.fasta_id}"
  aln=GappedAligner.new().align(pep_seq,gene_seq)
  puts "More than one intron.#{aln}" unless aln.gaps.length==1

  frags = aln.fragments
  # Work on a reversed copy so the aligner's own fragment list is left untouched
  frags = frags.reverse if protein_info.strand=='-'

  pep_coords = []
  frags.each do |frag|
    if protein_info.strand=='+'
      pep_coords << protein_info.start + frag[0]
      pep_coords << protein_info.start + frag[1]
    else
      pep_coords << protein_info.end - frag[1]
      pep_coords << protein_info.end - frag[0]
    end
  end

  [pep_coords]
end
243
+
244
# Converts peptide start/end residue indexes within a protein into genomic
# fragments, one [start,end] pair per exon the peptide touches.
#
# pepstart, pepend  - residue indexes of the peptide within the protein.
# gene_start        - genomic coordinate the residue indexes are offset from.
# gene_end          - not used by this method.
# coding_sequences  - list of [start,end] exon coordinates (any order).
#
# The calculation assumes the positive strand. Exons are walked in ascending
# order; each intron passed on the way shifts the running position by the
# intron length. The first fragment runs from the peptide start to the end of
# its exon, exons lying wholly before the peptide end are added in full, and
# the last fragment runs from the start of the terminal exon to the peptide end.
#
# Returns the fragments wrapped in a one element array.
# Raises ArgumentError when the walk needs an exon beyond the last one, i.e.
# the peptide position cannot be reconciled with coding_sequences.
def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)

  sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }

  # Assume positive strand
  pi_start=pepstart*3+gene_start-1
  pi_end=pepend*3+gene_start-1

  fragments=[]
  p_i = pi_start #Initially we are looking for the first fragment
  finding_start=true

  sorted_cds.each_with_index do |cds_coords, i|
    cds_start = cds_coords[0]
    cds_end = cds_coords[1]
    next_coords = sorted_cds[i+1]

    if cds_end < p_i # Exon is before index in sequence and doesn't contain p_i
      if next_coords.nil?
        raise ArgumentError, "Position #{p_i} lies beyond the last coding sequence (#{cds_start}..#{cds_end})"
      end

      intron_offset = ((next_coords[0]-cds_end)-1)
      p_i+=intron_offset
      pi_end+=intron_offset
      # Once the start has been placed, an exon passed in full is a middle exon
      fragments << [cds_start,cds_end] unless finding_start
    elsif finding_start
      if next_coords.nil?
        raise ArgumentError, "Peptide starts in the last coding sequence (#{cds_start}..#{cds_end}); no following exon to continue into"
      end

      fragments << [p_i+1,cds_end]
      intron_offset = ((next_coords[0]-cds_end)-1)
      pi_end+=intron_offset
      # From here on we are looking for the exon containing the peptide end
      p_i = pi_end
      finding_start=false
    else # A terminal exon
      fragments << [cds_start,p_i]
      break
    end
  end

  [fragments]
end
290
+
291
# Maps a peptide onto the genome using the coding sequences recorded for a
# predicted transcript.
#
# gene_seq should already have been reverse_complemented if on reverse strand.
#
# Returns nil when the peptide occurs in a six-frame translation of gene_seq,
# or when it cannot be found in prot_seq (a warning is printed in that case).
# Otherwise returns the result of fragment_coords_from_protein_coords.
def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
  return nil if peptide_is_in_sixframe(pep_seq,gene_seq)

  on_reverse_strand = protein_info.strand=='-'
  located_at = if on_reverse_strand
    prot_seq.reverse.index(pep_seq.reverse)
  else
    prot_seq.index(pep_seq)
  end

  if located_at.nil?
    puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
    return nil
  end

  # Plus 1 because on the reverse strand the stop codon will be at the beginning
  # of the sequence (when read forwards). Need to eliminate it.
  pep_start_i = on_reverse_strand ? located_at+1 : located_at
  pep_end_i = pep_start_i+pep_seq.length

  fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
end
320
+
321
# Finds every non-overlapping occurrence of pep_seq in prot_seq and converts
# each to genomic coordinates.
#
# For entries on the '-' strand both sequences are reversed before searching.
# Each occurrence at residue index si is mapped to
#   start = protein_info.start + 3*si
#   end   = start + 3*pep_seq.length - 1
#
# Returns one [[start,end]] element per occurrence (an empty array when there
# is none, or when pep_seq is empty).
def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)

  if ( protein_info.strand == '-' )
    prot_seq = prot_seq.reverse
    pep_seq = pep_seq.reverse
  end

  # An empty peptide would match at every offset without ever advancing
  return [] if pep_seq.empty?

  # Literal substring search: the peptide is never interpreted as a regular
  # expression, and each search resumes after the previous hit so repeated
  # occurrences get their own index instead of all resolving to the first one.
  start_indexes = []
  search_from = 0
  while (found = prot_seq.index(pep_seq,search_from))
    start_indexes << found
    search_from = found + pep_seq.length
  end

  start_indexes.collect do |si|
    pep_genomic_start = protein_info.start + 3*si
    pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
    [[pep_genomic_start,pep_genomic_end]]
  end

end
342
+
343
# Dispatches to the coordinate lookup matching the kind of database entry:
# six-frame entries are searched directly, all others are mapped through their
# transcript information. Returns whatever the selected lookup returns.
def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
  return get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info) if protein_info.is_sixframe

  get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
end
351
+
352
+
353
# Builds one GFF3 record per genomic fragment of a peptide.
#
# coords       - list of [start,end] genomic fragments.
# protein_info - supplies scaffold and strand.
# pep_id       - used as each record's Parent; the record ID is pep_id plus
#                ".fg" when name is "CDS" and ".sp" otherwise.
# peptide_seq  - expected translation of the concatenated fragments.
# genomedb     - database the scaffold sequence is fetched from.
# name         - feature type written to each record.
#
# Each record's Name attribute is the translation of its own fragment ("*"
# when translation raises), and its frame is the phase carried over from the
# preceding fragment. A mismatch between the translated concatenation of all
# fragments and peptide_seq is reported with a warning only.
#
# Returns the list of records.
def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
  scaff = get_fasta_record(protein_info.scaffold,genomedb)
  scaffold_seq = Bio::Sequence::NA.new(scaff.seq)

  fragment_phase = 0
  ordered_coords= protein_info.strand=='+' ? coords : coords.reverse
  frag_id = name=="CDS" ? "#{pep_id}.fg" : "#{pep_id}.sp"

  gff_lines = ordered_coords.collect do |frag_start,frag_end|
    frag_naseq = scaffold_seq[frag_start-1..frag_end-1]

    begin
      frag_frame = fragment_phase+1
      if ( protein_info.strand=='-')
        frag_seq = frag_naseq.reverse_complement.translate(frag_frame)
      else
        frag_seq = frag_naseq.translate(frag_frame)
      end
    rescue
      puts "Unable to translate #{frag_naseq}" if frag_naseq.length > 1
      frag_seq="*"
    end

    fragment_record=Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
      name,frag_start,frag_end,'',
      protein_info.strand,fragment_phase,[["Parent",pep_id],["ID",frag_id],["Name",frag_seq]])

    # Bases left over after the last whole codon determine the next fragment's phase
    remainder=(frag_naseq.length-fragment_phase) % 3
    fragment_phase=(3-remainder) % 3

    fragment_record
  end

  # Sanity check: the fragments joined in genomic order should translate to the peptide
  concat_seq=nil
  coords.each do |frag_start,frag_end|
    frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
    concat_seq = concat_seq.nil? ? frag_naseq : concat_seq + frag_naseq
  end

  check_seq = protein_info.strand=='-' ? concat_seq.reverse_complement.translate : concat_seq.translate
  puts "Fragment seqs not equal to peptide seqs" if check_seq != peptide_seq

  gff_lines
end
412
+
413
# Returns [first,last] coordinates for a start codon when the peptide begins
# with 'M' and is not immediately preceded by 'K' or 'R' in the protein (a
# peptide at the very beginning of the protein qualifies); returns nil otherwise.
#
# The first coordinate is peptide_genomic_start on the '+' strand and
# peptide_genomic_end-1 on any other strand; the last is first+2.
def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  offset = protein_seq.index(peptide_seq)
  return nil unless protein_seq[offset]=='M'

  follows_k_or_r = offset>0 && (protein_seq[offset-1]=='K' || protein_seq[offset-1]=='R')
  return nil if follows_k_or_r

  first_base = strand=='+' ? peptide_genomic_start : peptide_genomic_end-1
  [first_base,first_base+2]
end
431
+
432
# Returns [first,last] coordinates for the peptide's final codon when the
# peptide does not end in 'K' or 'R'; returns nil when it does.
#
# The first coordinate is peptide_genomic_end-3 on the '+' strand and
# peptide_genomic_start+1 on any other strand; the last is first+2.
# protein_seq is not used by this method.
def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  last_residue = peptide_seq[-1]
  return nil if last_residue=='K' || last_residue=='R'

  first_base = strand=='+' ? peptide_genomic_end-3 : peptide_genomic_start+1
  [first_base,first_base+2]
end
443
+
444
+
445
# Returns the stretch of protein_seq running from the nearest 'M' upstream of
# the peptide up to the residue just before the peptide.
#
# Only peptides that are not at the start of the protein, do not begin with
# 'M' and are not preceded by 'K' or 'R' are considered; nil is returned for
# all others, and also (with a message) when no 'M' precedes the peptide.
def get_signal_peptide_for_peptide(peptide_seq,protein_seq)
  offset = protein_seq.index(peptide_seq)

  return nil unless offset>0
  return nil if protein_seq[offset-1]=='K' || protein_seq[offset-1]=='R'
  return nil if protein_seq[offset]=='M'

  # Searching the reversed leader finds the methionine closest to the peptide
  distance_back = protein_seq[0..offset].reverse.index('M')
  if distance_back.nil?
    puts "No methionine found ahead of peptide sequence. Unable to determine signal peptide sequence"
    return nil
  end

  protein_seq[(offset-distance_back)..(offset-1)]
end
463
+
464
# Builds all GFF3 records for one peptide mapped onto one protein entry.
#
# protein_seq   - amino acid sequence of the protein.
# peptide_seq   - amino acid sequence of the peptide.
# protein_info  - location information for the protein entry.
# prot_id       - ID of the parent protein record.
# peptide_prob  - stored as the peptide record's score.
# peptide_count - used to build the peptide ID "<prot_id>.p<peptide_count>".
# genomedb      - genome database; required unless protein_info.is_sixframe.
#
# For every location returned by get_peptide_coordinates this emits a
# "peptide" record followed by its "CDS" fragments and, where the helper
# methods report them, "start_codon", "cterm" and "signalpeptide" records.
#
# Returns the list of records (empty when no coordinates were obtained).
# Raises RuntimeError when a transcript entry is given without a genome.
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,genomedb=nil)

  dna_sequence=nil
  unless protein_info.is_sixframe
    raise "A genome is required if predicted transcripts are to be mapped" if genomedb.nil?
    dna_sequence = get_dna_sequence(protein_info,genomedb)
  end

  peptide_coords = get_peptide_coordinates(protein_seq,peptide_seq,protein_info,dna_sequence)

  # Return value of nil means the entry is a predicted transcript that should already be covered by 6-frame
  return [] if peptide_coords.nil?

  gff_records=[]

  # Now convert peptide coordinate to genome coordinates
  # And create gff lines for each match
  peptide_coords.each do |coords|
    pep_genomic_start = coords.first[0]
    pep_genomic_end = coords.last[1]

    pep_id = "#{prot_id}.p#{peptide_count.to_s}"
    pep_attributes = [["ID",pep_id],["Parent",prot_id],["Name",peptide_seq]]

    pep_gff_line = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
      "peptide",pep_genomic_start,pep_genomic_end,peptide_prob,
      protein_info.strand,nil,pep_attributes)

    # For standard peptides
    frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
    gff_records += [pep_gff_line] + frag_gffs

    # For peptides with only 1 tryptic terminus
    start_codon_coords=get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if start_codon_coords
      # Attributes are a list of [tag,value] pairs, as for the other records
      start_codon_gff = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
        "start_codon",start_codon_coords[0],start_codon_coords[1],'',
        protein_info.strand,nil,[["Parent",pep_id]])
      gff_records << start_codon_gff
    end

    cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if cterm_coords
      cterm_gff = Bio::GFF::GFF3::Record.new(protein_info.scaffold,"MSMS",
        "cterm",cterm_coords[0],cterm_coords[1],'',
        protein_info.strand,nil,[["Parent",pep_id]])
      # Append the cterm record itself (not the start codon record)
      gff_records << cterm_gff
    end

    signal_peptide = get_signal_peptide_for_peptide(peptide_seq,protein_seq)
    if signal_peptide
      signal_peptide_coords=get_peptide_coordinates(protein_seq,signal_peptide,protein_info,dna_sequence)
      if signal_peptide_coords
        signal_peptide_coords.each do |spcoords|
          gff_records += generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,signal_peptide,genomedb,"signalpeptide")
        end
      end
    end
  end

  puts gff_records

  gff_records
end
539
+
540
# Main script: read the proteins, map their peptides and write the GFF file.
proteins = parse_proteins(tool.protxml)
fastadb = prepare_fasta(tool.database,'prot')
genomedb = tool.genome ? prepare_fasta(tool.genome,'nucl') : nil

puts "Aligning peptides and writing GFF data..."

low_prob = 0
skipped = 0
peptide_count = 0
protein_count = 0
total_peptides = 0

proteins.each do |prot|
  prot_prob = prot['probability']
  next if prot_prob.to_f < tool.peptide_probability_threshold

  # Gets identifiers of all proteins (including indistinguishable ones)
  prot_names=protein_names(prot)

  peptides=peptide_nodes(prot)
  entries_covered=[]

  prot_names.each do |protein_name|
    protein_count += 1
    prot_id = "pr#{protein_count.to_s}"
    begin
      protein_fasta_entry = get_fasta_record(protein_name,fastadb)
      protein_info = cds_info_from_fasta(protein_fasta_entry)

      if is_new_genome_location(protein_info,entries_covered)
        # Use the same string ID the peptide records refer to as their Parent
        protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,prot_id)

        # NOTE(review): these directives are repeated for every protein and the
        # sequence-region bounds are hard coded - confirm this is intended.
        gff_db.records += ["##gff-version 3\n","##sequence-region #{protein_info.scaffold} 1 160\n",protein_gff]

        prot_seq = protein_fasta_entry.aaseq.to_s
        raise "Not amino_acids" if prot_seq != protein_fasta_entry.seq.to_s

        peptide_count=1
        peptides.each do |peptide|
          pprob = peptide['nsp_adjusted_probability'].to_f
          next unless pprob >= tool.peptide_probability_threshold

          total_peptides += 1
          pep_seq = peptide['peptide_sequence']

          gff_db.records += generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,genomedb)
          peptide_count+=1
        end
      else
        puts "Skipping redundant entry #{protein_name}"
        protein_count-=1 # To counter +1 prior to begin rescue end block
      end

      entries_covered<<protein_info
    rescue KeyError,EncodingError
      # Entries that cannot be looked up or parsed are counted and skipped
      skipped+=1
    end
  end
end

# The block form closes the file even if writing a record raises
File.open(gff_out_file,'w+') do |out|
  gff_db.records.each { |rec| out.write(rec.to_s) }
end

p "Finished."
p "Proteins: #{protein_count}"
p "Skipped Decoys: #{skipped}"
p "Total Peptides: #{total_peptides}"
p "Peptides Written: #{total_peptides - low_prob}"
p "Peptides Culled: #{low_prob}"
exit(0)