protk 1.2.6.pre5 → 1.3.0.pre1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
@@ -7,73 +7,26 @@
7
7
  #
8
8
 
9
9
  require 'protk/constants'
10
- require 'protk/tool'
10
+ require 'protk/protxml_to_gff_tool'
11
11
  require 'protk/fastadb'
12
- require 'protk/gapped_aligner'
13
12
  require 'libxml'
14
13
  require 'bio'
15
14
 
16
15
  include LibXML
17
16
 
18
- tool=Tool.new([:explicit_output])
19
- tool.option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "
17
+ tool=ProtXMLToGFFTool.new()
20
18
 
19
+ @output_extension=".gff"
20
+ @output_suffix=""
21
21
 
22
- tool.options.protxml=nil
23
- tool.option_parser.on( '-p filename','--protxml filename', 'Observed Data (ProtXML Format)' ) do |file|
24
- tool.options.protxml=file
25
- end
26
-
27
- tool.options.database=nil
28
- tool.option_parser.on( '-d filename','--database filename', 'Database used for ms/ms searches (Fasta Format)' ) do |file|
29
- tool.options.database=file
30
- end
22
+ exit unless tool.check_options(true,[:database])
31
23
 
32
- tool.options.protein_find=nil
33
- tool.option_parser.on( '-f term','--find term', 'Restrict output to proteins whose name matches the specified string' ) do |term|
34
- tool.options.protein_find=term
35
- end
36
-
37
- tool.options.nterm_minlen=7
38
- tool.option_parser.on( '-n len','--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len' ) do |len|
39
- tool.options.nterm_minlen=len
40
- end
24
+ input_proxml=ARGV[0]
41
25
 
42
- tool.options.genome=nil
43
- tool.option_parser.on( '-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)' ) do |file|
44
- tool.options.genome=file
45
- end
46
-
47
- tool.options.skip_fasta_indexing=false
48
- tool.option_parser.on('--skip-index','Don\'t index database (Index should already exist)') do
49
- tool.options.skip_fasta_indexing=true
50
- end
51
-
52
- tool.options.stack_charge_states=false
53
- tool.option_parser.on('--stack-charge-states','Different peptide charge states get separate gff entries') do
54
- tool.options.stack_charge_states=true
55
- end
56
-
57
- tool.options.collapse_redundant_proteins=false
58
- tool.option_parser.on('--collapse-redundant-proteins','Proteins that cover genomic regions already covered will be skipped') do
59
- tool.options.collapse_redundant_proteins=true
60
- end
61
-
62
- tool.options.peptide_probability_threshold=0.95
63
- tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
64
- tool.options.peptide_probability_threshold=thresh.to_f
65
- end
66
-
67
- tool.options.protein_probability_threshold=0.99
68
- tool.option_parser.on('--prot-threshold prob','Protein Probability Threshold (Default 0.99)') do |thresh|
69
- tool.options.protein_probability_threshold=thresh.to_f
70
- end
71
-
72
- exit unless tool.check_options [:protxml,:database]
73
-
74
- gff_out_file="peptides.gff"
75
- if ( tool.explicit_output != nil)
76
- gff_out_file=tool.explicit_output
26
+ if ( tool.explicit_output!=nil)
27
+ gff_out_file=tool.explicit_output
28
+ else
29
+ gff_out_file=Tool.default_output_path(input_proxml,@output_extension,tool.output_prefix,@output_suffix)
77
30
  end
78
31
 
79
32
  gff_db = Bio::GFF.new()
@@ -92,7 +45,7 @@ def prepare_fasta(database_path,type)
92
45
  db_filename = nil
93
46
  case
94
47
  when Pathname.new(database_path).exist? # It's an explicitly named db
95
- db_filename = Pathname.new(database_path).realpath.to_s
48
+ db_filename = Pathname.new(database_path).expand_path.to_s
96
49
  else
97
50
  db_filename=Constants.new.current_database_for_name(database_path)
98
51
  end
@@ -109,457 +62,7 @@ def prepare_fasta(database_path,type)
109
62
  orf_lookup
110
63
  end
111
64
 
112
- def protein_names(protein_node)
113
- indis_proteins = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
114
- prot_names = [protein_node['protein_name']]
115
- for protein in indis_proteins
116
- prot_names += [protein['protein_name']]
117
- end
118
- prot_names
119
- end
120
-
121
- def peptide_nodes(protein_node)
122
- return protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
123
- end
124
-
125
-
126
- def get_fasta_record(protein_name,fastadb)
127
- # puts "Looking up #{protein_name}"
128
- entry = fastadb.get_by_id protein_name
129
- if ( entry == nil)
130
- puts "Failed lookup for #{protein_name}"
131
- raise KeyError
132
- end
133
- entry
134
- end
135
-
136
- class CDSInfo
137
- attr_accessor :fasta_id
138
- attr_accessor :strand
139
- attr_accessor :frame
140
- attr_accessor :name
141
- attr_accessor :scaffold
142
- attr_accessor :start
143
- attr_accessor :end
144
- attr_accessor :coding_sequences
145
- attr_accessor :is_sixframe
146
- attr_accessor :gene_id
147
-
148
- def overlap(candidate_entry)
149
- return false if candidate_entry.scaffold!=self.scaffold
150
- return false if strand!=self.strand
151
- return false if candidate_entry.start >= self.end
152
- return false if self.start <= candidate_entry.end
153
- return true
154
- end
155
-
156
- end
157
-
158
- def cds_info_from_fasta(fasta_entry)
159
- info=CDSInfo.new
160
- info.fasta_id=fasta_entry
161
- positions = fasta_entry.identifiers.description.split(' ').collect { |coords| coords.split('|').collect {|pos| pos.to_i} }
162
- info.coding_sequences=[]
163
- info.gene_id
164
- if ( positions.length < 1 )
165
- raise EncodingError
166
- elsif ( positions.length > 1)
167
- info.coding_sequences = positions[1..-1]
168
- end
169
-
170
- info.start = positions[0][0]
171
- info.end = positions[0][1]
172
-
173
- info.scaffold=fasta_entry.entry_id.scan(/(scaffold_?\d+)_/)[0][0]
174
- info.name = fasta_entry.entry_id.scan(/lcl\|(.*)/)[0][0]
175
-
176
- if fasta_entry.entry_id =~ /frame/
177
- info.frame=info.name.scan(/frame_(\d)/)[0][0]
178
- info.strand = (info.frame.to_i > 3) ? '-' : '+'
179
- info.is_sixframe = true
180
- else
181
- info.strand = (info.name =~ /rev/) ? '-' : '+'
182
- info.gene_id=info.name.scan(/_\w{3}_(.*)\.t/)[0][0]
183
- info.is_sixframe = false
184
- end
185
- info
186
- end
187
-
188
-
189
- def is_new_genome_location(candidate_entry,existing_entries)
190
- # puts existing_entries
191
- # require 'debugger';debugger
192
-
193
- # genes=existing_entries.collect { |e| e.gene_id }.compact
194
-
195
- # if genes.include?(candidate_entry.gene_id)
196
- # return false
197
- # end
198
-
199
- existing_entries.each do |existing|
200
- return false if existing.gene_id==candidate_entry.gene_id
201
- return false if existing.overlap(candidate_entry)
202
- end
203
-
204
- return true
205
- end
206
-
207
-
208
- def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
209
- prot_qualifiers = {"source" => "MSMS", "score" => prot_prob, "ID" => prot_id}
210
- prot_attributes = [["ID",prot_id],["Name",entry_info.name]]
211
- prot_gff_line = Bio::GFF::GFF3::Record.new(seqid = entry_info.scaffold,source="MSMS",feature_type="protein",
212
- start_position=entry_info.start,end_position=entry_info.end,score=prot_prob,strand=entry_info.strand,frame=nil,attributes=prot_attributes)
213
- prot_gff_line
214
- end
215
-
216
- def get_dna_sequence(protein_info,genomedb)
217
-
218
- scaffold_sequence = get_fasta_record(protein_info.scaffold,genomedb)
219
- gene_sequence = scaffold_sequence.naseq.to_s[(protein_info.start-1)..protein_info.end]
220
-
221
- if ( protein_info.strand == "-")
222
- gene_sequence = Bio::Sequence::NA.new(gene_sequence).reverse_complement
223
- end
224
-
225
- gene_sequence
226
- end
227
-
228
- def peptide_is_in_sixframe(pep_seq,gene_seq)
229
- gs=Bio::Sequence::NA.new(gene_seq)
230
- (1..6).each do |frame|
231
- if gs.translate(frame).index(pep_seq)
232
- return true
233
- end
234
- end
235
- return false
236
- end
237
-
238
- def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
239
-
240
- sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }
241
-
242
-
243
- # Assume positive strand
244
- pi_start=pepstart*3+gene_start-1
245
- pi_end=pepend*3+gene_start-1
246
-
247
- fragments=[]
248
- p_i = pi_start #Initially we are looking for the first fragment
249
- finding_start=true
250
-
251
- sorted_cds.each_with_index do |cds_coords, i|
252
- cds_start=cds_coords[0]
253
- cds_end = cds_coords[1]
254
- if cds_end < p_i # Exon is before index in sequence and doesn't contain p_i
255
- if sorted_cds.length <= i+1
256
- require 'debugger';debugger
257
- end
258
-
259
- next_coords = sorted_cds[i+1]
260
- intron_offset = ((next_coords[0]-cds_end)-1)
261
- p_i+=intron_offset
262
- pi_end+=intron_offset
263
- if !finding_start
264
- # This is a middle exon
265
- fragments << [cds_start,cds_end]
266
- end
267
- else
268
- if finding_start
269
-
270
- if ( pi_end <= cds_end) #Whole peptide contained in a single exon
271
- fragments << [p_i+1,pi_end]
272
- break;
273
- end
274
-
275
-
276
- fragments << [p_i+1,(cds_end)]
277
- next_coords = sorted_cds[i+1]
278
- intron_offset = ((next_coords[0]-cds_end)-1)
279
- p_i+=intron_offset
280
- pi_end+=intron_offset
281
- p_i = pi_end
282
- finding_start=false
283
- else # A terminal exon
284
- # require 'debugger';debugger
285
- fragments << [(cds_start),(p_i)]
286
- break;
287
- end
288
- end
289
- end
290
- [fragments]
291
- end
292
-
293
- # gene_seq should already have been reverse_complemented if on reverse strand
294
- def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
295
- # if ( peptide_is_in_sixframe(pep_seq,gene_seq))
296
- # Peptide is in 6-frame but on a predicted transcript
297
- # return nil
298
- # else
299
-
300
- # puts "Found a gap #{protein_info.fasta_id}"
301
- if protein_info.strand=='-'
302
- pep_index = prot_seq.reverse.index(pep_seq.reverse)
303
- if pep_index==nil
304
- # require 'debugger';debugger
305
- puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
306
- return nil
307
- end
308
- pep_start_i = prot_seq.reverse.index(pep_seq.reverse)+1
309
- # Plus 1 because on reverse stand stop-codon will be at the beginning of the sequence (when read forwards). Need to eliminate it.
310
- else
311
- pep_start_i = prot_seq.index(pep_seq)
312
- if pep_start_i==nil
313
- # require 'debugger';debugger
314
- puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
315
- return nil
316
- end
317
- end
318
- pep_end_i = pep_start_i+pep_seq.length
319
-
320
- return fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
321
- # end
322
- end
323
-
324
- def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
325
-
326
- if ( protein_info.strand == '-' )
327
- prot_seq = prot_seq.reverse
328
- pep_seq = pep_seq.reverse
329
- end
330
-
331
- start_indexes = [0]
332
-
333
- prot_seq.scan /#{pep_seq}/ do |match|
334
- start_indexes << prot_seq.index(match,start_indexes.last)
335
- end
336
- start_indexes.delete_at(0)
337
-
338
- start_indexes.collect do |si|
339
- pep_genomic_start = protein_info.start + 3*si
340
- pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
341
- [[pep_genomic_start,pep_genomic_end]]
342
- end
343
-
344
- end
345
-
346
- # Returns a 4-mer [genomic_start,fragment1_end(or0),frag2_start(or0),genomic_end]
347
- def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
348
- if ( protein_info.is_sixframe)
349
- return get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
350
- else
351
- return get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
352
- end
353
- end
354
-
355
-
356
- def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
357
- scaff = get_fasta_record(protein_info.scaffold,genomedb)
358
- scaffold_seq = Bio::Sequence::NA.new(scaff.seq)
359
-
360
- fragment_phase = 0
361
- ordered_coords= protein_info.strand=='+' ? coords : coords.reverse
362
- if name=="CDS"
363
- frag_id="#{pep_id}.fg"
364
- else
365
- frag_id="#{pep_id}.sp"
366
- end
367
- gff_lines = ordered_coords.collect do |frag_start,frag_end|
368
- frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
369
-
370
- begin
371
- frag_frame = fragment_phase+1
372
- frag_seq = nil
373
- if ( protein_info.strand=='-')
374
- frag_seq = frag_naseq.reverse_complement.translate(frag_frame)
375
- else
376
- frag_seq = frag_naseq.translate(frag_frame)
377
- end
378
- rescue
379
- if frag_naseq.length > 1
380
- puts "Unable to translate #{frag_naseq}"
381
- # require 'debugger'
382
- end
383
- frag_seq="*"
384
- end
385
-
386
- fragment_record=Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
387
- feature_type=name,start_position=frag_start,end_position=frag_end,score='',
388
- strand=protein_info.strand,frame=fragment_phase,attributes=[["Parent",pep_id],["ID",frag_id],["Name",frag_seq]])
389
-
390
-
391
- remainder=(frag_naseq.length-fragment_phase) % 3
392
- fragment_phase=(3-remainder) % 3
393
-
394
- fragment_record
395
- end
396
-
397
-
398
- concat_seq=nil
399
-
400
- coords.each do |frag_start,frag_end|
401
- frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
402
- concat_seq += frag_naseq unless concat_seq == nil
403
- concat_seq = frag_naseq if concat_seq==nil
404
- end
405
-
406
- check_seq = protein_info.strand=='-' ? concat_seq.reverse_complement.translate : concat_seq.translate
407
- if ( check_seq != peptide_seq)
408
- require 'debugger';debugger
409
- puts "Fragment seqs not equal to peptide seqs"
410
- end
411
-
412
- return gff_lines
413
-
414
- end
415
-
416
- def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
417
- pi=protein_seq.index(peptide_seq)
418
- if ( protein_seq[pi]=='M' )
419
- is_tryptic=false
420
- if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R') )
421
- is_tryptic=true
422
- elsif (pi==0)
423
- is_tryptic=true
424
- end
425
- return nil unless is_tryptic
426
-
427
- start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-2
428
- # require 'debugger';debugger
429
- return [start_codon_coord,start_codon_coord+2]
430
- else
431
- return nil
432
- end
433
- end
434
-
435
- def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
436
-
437
- if ( (peptide_seq[-1]!='K' && peptide_seq[-1]!='R' ) )
438
-
439
- codon_coord = (strand=='+') ? peptide_genomic_end-3 : peptide_genomic_start+1
440
- # require 'debugger';debugger
441
- return [codon_coord,codon_coord+2]
442
- else
443
- return nil
444
- end
445
- end
446
-
447
-
448
- def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
449
- pi=protein_seq.index(peptide_seq)
450
- if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R' && protein_seq[pi]!='M') )
451
- # Since trypsin sometimes cleaves before P (ie breaking the rule)
452
- # we don't check for it and assume those cases are real tryptic termini
453
- reverse_leader_seq=protein_seq[0..pi].reverse
454
- mi=reverse_leader_seq.index('M')
455
-
456
- if ( mi==nil )
457
- puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
458
- return nil
459
- end
460
-
461
- mi=pi-mi
462
-
463
- ntermseq=protein_seq[mi..(pi-1)]
464
-
465
- # if ( ntermseq.length < minlen )
466
- # return nil
467
- # end
468
-
469
- # $STDOUT.write protein_seq[mi..(pi+peptide_seq.length-1)]
470
- # require 'debugger';debugger
471
- full_seq_with_annotations = "#{ntermseq}(cleaved)#{protein_seq[(pi..(pi+peptide_seq.length-1))]}"
472
-
473
- return full_seq_with_annotations
474
- else
475
- return nil
476
- end
477
- end
478
-
479
- def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)
480
-
481
- prot_seq = protein_seq
482
- pep_seq = peptide_seq
483
-
484
-
485
- peptide_coords = get_peptide_coordinates(prot_seq,pep_seq,protein_info,dna_sequence)
486
-
487
- if ( peptide_coords==nil ) # Return value of nil means the entry is a predicted transcript that should already be covered by 6-frame
488
- return []
489
- end
490
-
491
- gff_records=[]
492
-
493
- # Now convert peptide coordinate to genome coordinates
494
- # And create gff lines for each match
495
- peptide_coords.each do |coords|
496
-
497
- # require 'debugger';debugger
498
- pep_genomic_start = coords.first[0]
499
- pep_genomic_end = coords.last[1]
500
-
501
- pep_id = "#{prot_id}.p#{peptide_count.to_s}"
502
- pep_attributes = [["ID",pep_id],["Parent",prot_id],["Name",pep_seq]]
503
-
504
- pep_gff_line = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
505
- feature_type="peptide",start_position=pep_genomic_start,end_position=pep_genomic_end,score=peptide_prob,
506
- strand=protein_info.strand,frame=nil,attributes=pep_attributes)
507
-
508
- # For standard peptides
509
- frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
510
- gff_records += [pep_gff_line] + frag_gffs
511
- # require 'debugger';debugger
512
- # For peptides with only 1 tryptic terminus
513
- start_codon_coords=get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
514
- if start_codon_coords
515
- start_codon_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
516
- feature_type="start_codon",start_position=start_codon_coords[0],end_position=start_codon_coords[1],score='',
517
- strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
518
- gff_records+=[start_codon_gff]
519
- end
520
-
521
- cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
522
- if ( cterm_coords )
523
- cterm_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
524
- feature_type="cterm",start_position=cterm_coords[0],end_position=cterm_coords[1],score='',
525
- strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
526
- gff_records+=[start_codon_gff]
527
- end
528
-
529
- end
530
- # puts gff_records
531
-
532
- gff_records
533
- end
534
-
535
- def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
536
- pep_id = "#{prot_id}.p#{peptide_count.to_s}"
537
- signal_peptide = get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
538
- if signal_peptide
539
- $stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"
540
- raw_signal_peptide=signal_peptide.sub(/\(cleaved\)/,"")
541
- # Get raw signal_peptide sequence
542
-
543
- signal_peptide_coords=get_peptide_coordinates(protein_seq,raw_signal_peptide,protein_info,dna_sequence)
544
- if signal_peptide_coords
545
- signal_peptide_coords.each do |spcoords|
546
- signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,raw_signal_peptide,genomedb,"signalpeptide")
547
- gff_records += signal_peptide_gff
548
- end
549
- end
550
- end
551
- end
552
-
553
- def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
554
- nameindex = peptide_gff.attributes.index {|obj| obj[0]=="Name" }
555
- pep_seq = peptide_gff.attributes[nameindex][1]
556
- existing = peptides_covered_genome[pep_seq]
557
- return true if existing==peptide_gff.start
558
-
559
- return false
560
- end
561
-
562
- proteins = parse_proteins(tool.protxml)
65
+ proteins = parse_proteins(input_proxml)
563
66
  fastadb = prepare_fasta(tool.database,'prot')
564
67
  genomedb = nil
565
68
  if tool.genome
@@ -583,7 +86,7 @@ for prot in proteins
583
86
  end
584
87
 
585
88
  # Gets identifiers of all proteins (includeing indistinguishable ones)
586
- prot_names=protein_names(prot)
89
+ prot_names=tool.protein_names(prot)
587
90
 
588
91
 
589
92
  if tool.protein_find!=nil
@@ -591,19 +94,19 @@ for prot in proteins
591
94
  end
592
95
 
593
96
 
594
- peptides=peptide_nodes(prot)
97
+ peptides=tool.peptide_nodes(prot)
595
98
  entries_covered=[]
596
99
  for protein_name in prot_names
597
100
  protein_count += 1
598
101
  prot_id = "pr#{protein_count.to_s}"
599
102
  begin
600
103
 
601
- protein_fasta_entry = get_fasta_record(protein_name,fastadb)
602
- protein_info = cds_info_from_fasta(protein_fasta_entry)
104
+ protein_fasta_entry = tool.get_fasta_record(protein_name,fastadb)
105
+ protein_info = tool.cds_info_from_fasta(protein_fasta_entry)
603
106
 
604
- unless (tool.collapse_redundant_proteins && !is_new_genome_location(protein_info,entries_covered) )
107
+ unless (tool.collapse_redundant_proteins && !tool.is_new_genome_location(protein_info,entries_covered) )
605
108
 
606
- protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
109
+ protein_gff = tool.generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
607
110
 
608
111
  gff_db.records += ["##gff-version 3\n","##sequence-region #{protein_info.scaffold} 1 160\n",protein_gff]
609
112
 
@@ -624,15 +127,15 @@ for prot in proteins
624
127
  dna_sequence=nil
625
128
  if !protein_info.is_sixframe
626
129
  throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
627
- dna_sequence = get_dna_sequence(protein_info,genomedb)
130
+ dna_sequence = tool.get_dna_sequence(protein_info,genomedb)
628
131
  end
629
132
 
630
133
 
631
- peptide_gff = generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
134
+ peptide_gff = tool.generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
632
135
 
633
- unless (peptide_gff.length==0 || peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
136
+ unless (peptide_gff.length==0 || tool.peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
634
137
 
635
- add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
138
+ tool.add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
636
139
 
637
140
  gff_db.records += peptide_gff
638
141
 
@@ -19,23 +19,9 @@ include LibXML
19
19
  tool=Tool.new([:explicit_output])
20
20
  tool.option_parser.banner = "Convert a protXML file to a tab delimited table.\n\nUsage: protxml_to_table.rb [options] file1.protXML"
21
21
 
22
- tool.options.groups=false
23
- tool.option_parser.on("--groups","Print output by groups rather than for each protein") do
24
- tool.options.groups=true
25
- end
26
-
27
- # tool.options.proteinid_regex=".*?\|.*?\|(.*)"
28
- # tool.option_parser.on( '--regex rexpr', 'Regex' ) do |regex|
29
- # tool.options.proteinid_regex=regex
30
- # end
22
+ tool.add_boolean_option(:groups,false,["--groups","Print output by groups rather than for each protein"])
31
23
 
32
- exit unless tool.check_options
33
-
34
- if ( ARGV[0].nil? )
35
- puts "You must supply an input file"
36
- puts tool.option_parser
37
- exit
38
- end
24
+ exit unless tool.check_options(true)
39
25
 
40
26
  input_file=ARGV[0]
41
27