protk 1.2.6.pre5 → 1.3.0.pre1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
@@ -7,73 +7,26 @@
7
7
  #
8
8
 
9
9
  require 'protk/constants'
10
- require 'protk/tool'
10
+ require 'protk/protxml_to_gff_tool'
11
11
  require 'protk/fastadb'
12
- require 'protk/gapped_aligner'
13
12
  require 'libxml'
14
13
  require 'bio'
15
14
 
16
15
  include LibXML
17
16
 
18
- tool=Tool.new([:explicit_output])
19
- tool.option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "
17
+ tool=ProtXMLToGFFTool.new()
20
18
 
19
+ @output_extension=".gff"
20
+ @output_suffix=""
21
21
 
22
- tool.options.protxml=nil
23
- tool.option_parser.on( '-p filename','--protxml filename', 'Observed Data (ProtXML Format)' ) do |file|
24
- tool.options.protxml=file
25
- end
26
-
27
- tool.options.database=nil
28
- tool.option_parser.on( '-d filename','--database filename', 'Database used for ms/ms searches (Fasta Format)' ) do |file|
29
- tool.options.database=file
30
- end
22
+ exit unless tool.check_options(true,[:database])
31
23
 
32
- tool.options.protein_find=nil
33
- tool.option_parser.on( '-f term','--find term', 'Restrict output to proteins whose name matches the specified string' ) do |term|
34
- tool.options.protein_find=term
35
- end
36
-
37
- tool.options.nterm_minlen=7
38
- tool.option_parser.on( '-n len','--nterm-min-len len', 'Only include inferred N-terminal sequences if longer than len' ) do |len|
39
- tool.options.nterm_minlen=len
40
- end
24
+ input_proxml=ARGV[0]
41
25
 
42
- tool.options.genome=nil
43
- tool.option_parser.on( '-g filename','--genome filename', 'Nucleotide sequences for scaffolds (Fasta Format)' ) do |file|
44
- tool.options.genome=file
45
- end
46
-
47
- tool.options.skip_fasta_indexing=false
48
- tool.option_parser.on('--skip-index','Don\'t index database (Index should already exist)') do
49
- tool.options.skip_fasta_indexing=true
50
- end
51
-
52
- tool.options.stack_charge_states=false
53
- tool.option_parser.on('--stack-charge-states','Different peptide charge states get separate gff entries') do
54
- tool.options.stack_charge_states=true
55
- end
56
-
57
- tool.options.collapse_redundant_proteins=false
58
- tool.option_parser.on('--collapse-redundant-proteins','Proteins that cover genomic regions already covered will be skipped') do
59
- tool.options.collapse_redundant_proteins=true
60
- end
61
-
62
- tool.options.peptide_probability_threshold=0.95
63
- tool.option_parser.on('--threshold prob','Peptide Probability Threshold (Default 0.95)') do |thresh|
64
- tool.options.peptide_probability_threshold=thresh.to_f
65
- end
66
-
67
- tool.options.protein_probability_threshold=0.99
68
- tool.option_parser.on('--prot-threshold prob','Protein Probability Threshold (Default 0.99)') do |thresh|
69
- tool.options.protein_probability_threshold=thresh.to_f
70
- end
71
-
72
- exit unless tool.check_options [:protxml,:database]
73
-
74
- gff_out_file="peptides.gff"
75
- if ( tool.explicit_output != nil)
76
- gff_out_file=tool.explicit_output
26
+ if ( tool.explicit_output!=nil)
27
+ gff_out_file=tool.explicit_output
28
+ else
29
+ gff_out_file=Tool.default_output_path(input_proxml,@output_extension,tool.output_prefix,@output_suffix)
77
30
  end
78
31
 
79
32
  gff_db = Bio::GFF.new()
@@ -92,7 +45,7 @@ def prepare_fasta(database_path,type)
92
45
  db_filename = nil
93
46
  case
94
47
  when Pathname.new(database_path).exist? # It's an explicitly named db
95
- db_filename = Pathname.new(database_path).realpath.to_s
48
+ db_filename = Pathname.new(database_path).expand_path.to_s
96
49
  else
97
50
  db_filename=Constants.new.current_database_for_name(database_path)
98
51
  end
@@ -109,457 +62,7 @@ def prepare_fasta(database_path,type)
109
62
  orf_lookup
110
63
  end
111
64
 
112
- def protein_names(protein_node)
113
- indis_proteins = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
114
- prot_names = [protein_node['protein_name']]
115
- for protein in indis_proteins
116
- prot_names += [protein['protein_name']]
117
- end
118
- prot_names
119
- end
120
-
121
- def peptide_nodes(protein_node)
122
- return protein_node.find('protxml:peptide','protxml:http://regis-web.systemsbiology.net/protXML')
123
- end
124
-
125
-
126
- def get_fasta_record(protein_name,fastadb)
127
- # puts "Looking up #{protein_name}"
128
- entry = fastadb.get_by_id protein_name
129
- if ( entry == nil)
130
- puts "Failed lookup for #{protein_name}"
131
- raise KeyError
132
- end
133
- entry
134
- end
135
-
136
- class CDSInfo
137
- attr_accessor :fasta_id
138
- attr_accessor :strand
139
- attr_accessor :frame
140
- attr_accessor :name
141
- attr_accessor :scaffold
142
- attr_accessor :start
143
- attr_accessor :end
144
- attr_accessor :coding_sequences
145
- attr_accessor :is_sixframe
146
- attr_accessor :gene_id
147
-
148
- def overlap(candidate_entry)
149
- return false if candidate_entry.scaffold!=self.scaffold
150
- return false if strand!=self.strand
151
- return false if candidate_entry.start >= self.end
152
- return false if self.start <= candidate_entry.end
153
- return true
154
- end
155
-
156
- end
157
-
158
- def cds_info_from_fasta(fasta_entry)
159
- info=CDSInfo.new
160
- info.fasta_id=fasta_entry
161
- positions = fasta_entry.identifiers.description.split(' ').collect { |coords| coords.split('|').collect {|pos| pos.to_i} }
162
- info.coding_sequences=[]
163
- info.gene_id
164
- if ( positions.length < 1 )
165
- raise EncodingError
166
- elsif ( positions.length > 1)
167
- info.coding_sequences = positions[1..-1]
168
- end
169
-
170
- info.start = positions[0][0]
171
- info.end = positions[0][1]
172
-
173
- info.scaffold=fasta_entry.entry_id.scan(/(scaffold_?\d+)_/)[0][0]
174
- info.name = fasta_entry.entry_id.scan(/lcl\|(.*)/)[0][0]
175
-
176
- if fasta_entry.entry_id =~ /frame/
177
- info.frame=info.name.scan(/frame_(\d)/)[0][0]
178
- info.strand = (info.frame.to_i > 3) ? '-' : '+'
179
- info.is_sixframe = true
180
- else
181
- info.strand = (info.name =~ /rev/) ? '-' : '+'
182
- info.gene_id=info.name.scan(/_\w{3}_(.*)\.t/)[0][0]
183
- info.is_sixframe = false
184
- end
185
- info
186
- end
187
-
188
-
189
- def is_new_genome_location(candidate_entry,existing_entries)
190
- # puts existing_entries
191
- # require 'debugger';debugger
192
-
193
- # genes=existing_entries.collect { |e| e.gene_id }.compact
194
-
195
- # if genes.include?(candidate_entry.gene_id)
196
- # return false
197
- # end
198
-
199
- existing_entries.each do |existing|
200
- return false if existing.gene_id==candidate_entry.gene_id
201
- return false if existing.overlap(candidate_entry)
202
- end
203
-
204
- return true
205
- end
206
-
207
-
208
- def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
209
- prot_qualifiers = {"source" => "MSMS", "score" => prot_prob, "ID" => prot_id}
210
- prot_attributes = [["ID",prot_id],["Name",entry_info.name]]
211
- prot_gff_line = Bio::GFF::GFF3::Record.new(seqid = entry_info.scaffold,source="MSMS",feature_type="protein",
212
- start_position=entry_info.start,end_position=entry_info.end,score=prot_prob,strand=entry_info.strand,frame=nil,attributes=prot_attributes)
213
- prot_gff_line
214
- end
215
-
216
- def get_dna_sequence(protein_info,genomedb)
217
-
218
- scaffold_sequence = get_fasta_record(protein_info.scaffold,genomedb)
219
- gene_sequence = scaffold_sequence.naseq.to_s[(protein_info.start-1)..protein_info.end]
220
-
221
- if ( protein_info.strand == "-")
222
- gene_sequence = Bio::Sequence::NA.new(gene_sequence).reverse_complement
223
- end
224
-
225
- gene_sequence
226
- end
227
-
228
- def peptide_is_in_sixframe(pep_seq,gene_seq)
229
- gs=Bio::Sequence::NA.new(gene_seq)
230
- (1..6).each do |frame|
231
- if gs.translate(frame).index(pep_seq)
232
- return true
233
- end
234
- end
235
- return false
236
- end
237
-
238
- def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
239
-
240
- sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }
241
-
242
-
243
- # Assume positive strand
244
- pi_start=pepstart*3+gene_start-1
245
- pi_end=pepend*3+gene_start-1
246
-
247
- fragments=[]
248
- p_i = pi_start #Initially we are looking for the first fragment
249
- finding_start=true
250
-
251
- sorted_cds.each_with_index do |cds_coords, i|
252
- cds_start=cds_coords[0]
253
- cds_end = cds_coords[1]
254
- if cds_end < p_i # Exon is before index in sequence and doesn't contain p_i
255
- if sorted_cds.length <= i+1
256
- require 'debugger';debugger
257
- end
258
-
259
- next_coords = sorted_cds[i+1]
260
- intron_offset = ((next_coords[0]-cds_end)-1)
261
- p_i+=intron_offset
262
- pi_end+=intron_offset
263
- if !finding_start
264
- # This is a middle exon
265
- fragments << [cds_start,cds_end]
266
- end
267
- else
268
- if finding_start
269
-
270
- if ( pi_end <= cds_end) #Whole peptide contained in a single exon
271
- fragments << [p_i+1,pi_end]
272
- break;
273
- end
274
-
275
-
276
- fragments << [p_i+1,(cds_end)]
277
- next_coords = sorted_cds[i+1]
278
- intron_offset = ((next_coords[0]-cds_end)-1)
279
- p_i+=intron_offset
280
- pi_end+=intron_offset
281
- p_i = pi_end
282
- finding_start=false
283
- else # A terminal exon
284
- # require 'debugger';debugger
285
- fragments << [(cds_start),(p_i)]
286
- break;
287
- end
288
- end
289
- end
290
- [fragments]
291
- end
292
-
293
- # gene_seq should already have been reverse_complemented if on reverse strand
294
- def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
295
- # if ( peptide_is_in_sixframe(pep_seq,gene_seq))
296
- # Peptide is in 6-frame but on a predicted transcript
297
- # return nil
298
- # else
299
-
300
- # puts "Found a gap #{protein_info.fasta_id}"
301
- if protein_info.strand=='-'
302
- pep_index = prot_seq.reverse.index(pep_seq.reverse)
303
- if pep_index==nil
304
- # require 'debugger';debugger
305
- puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
306
- return nil
307
- end
308
- pep_start_i = prot_seq.reverse.index(pep_seq.reverse)+1
309
- # Plus 1 because on reverse stand stop-codon will be at the beginning of the sequence (when read forwards). Need to eliminate it.
310
- else
311
- pep_start_i = prot_seq.index(pep_seq)
312
- if pep_start_i==nil
313
- # require 'debugger';debugger
314
- puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
315
- return nil
316
- end
317
- end
318
- pep_end_i = pep_start_i+pep_seq.length
319
-
320
- return fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
321
- # end
322
- end
323
-
324
- def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
325
-
326
- if ( protein_info.strand == '-' )
327
- prot_seq = prot_seq.reverse
328
- pep_seq = pep_seq.reverse
329
- end
330
-
331
- start_indexes = [0]
332
-
333
- prot_seq.scan /#{pep_seq}/ do |match|
334
- start_indexes << prot_seq.index(match,start_indexes.last)
335
- end
336
- start_indexes.delete_at(0)
337
-
338
- start_indexes.collect do |si|
339
- pep_genomic_start = protein_info.start + 3*si
340
- pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
341
- [[pep_genomic_start,pep_genomic_end]]
342
- end
343
-
344
- end
345
-
346
- # Returns a 4-mer [genomic_start,fragment1_end(or0),frag2_start(or0),genomic_end]
347
- def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
348
- if ( protein_info.is_sixframe)
349
- return get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
350
- else
351
- return get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
352
- end
353
- end
354
-
355
-
356
- def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
357
- scaff = get_fasta_record(protein_info.scaffold,genomedb)
358
- scaffold_seq = Bio::Sequence::NA.new(scaff.seq)
359
-
360
- fragment_phase = 0
361
- ordered_coords= protein_info.strand=='+' ? coords : coords.reverse
362
- if name=="CDS"
363
- frag_id="#{pep_id}.fg"
364
- else
365
- frag_id="#{pep_id}.sp"
366
- end
367
- gff_lines = ordered_coords.collect do |frag_start,frag_end|
368
- frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
369
-
370
- begin
371
- frag_frame = fragment_phase+1
372
- frag_seq = nil
373
- if ( protein_info.strand=='-')
374
- frag_seq = frag_naseq.reverse_complement.translate(frag_frame)
375
- else
376
- frag_seq = frag_naseq.translate(frag_frame)
377
- end
378
- rescue
379
- if frag_naseq.length > 1
380
- puts "Unable to translate #{frag_naseq}"
381
- # require 'debugger'
382
- end
383
- frag_seq="*"
384
- end
385
-
386
- fragment_record=Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
387
- feature_type=name,start_position=frag_start,end_position=frag_end,score='',
388
- strand=protein_info.strand,frame=fragment_phase,attributes=[["Parent",pep_id],["ID",frag_id],["Name",frag_seq]])
389
-
390
-
391
- remainder=(frag_naseq.length-fragment_phase) % 3
392
- fragment_phase=(3-remainder) % 3
393
-
394
- fragment_record
395
- end
396
-
397
-
398
- concat_seq=nil
399
-
400
- coords.each do |frag_start,frag_end|
401
- frag_naseq = scaffold_seq[frag_start-1..frag_end-1]
402
- concat_seq += frag_naseq unless concat_seq == nil
403
- concat_seq = frag_naseq if concat_seq==nil
404
- end
405
-
406
- check_seq = protein_info.strand=='-' ? concat_seq.reverse_complement.translate : concat_seq.translate
407
- if ( check_seq != peptide_seq)
408
- require 'debugger';debugger
409
- puts "Fragment seqs not equal to peptide seqs"
410
- end
411
-
412
- return gff_lines
413
-
414
- end
415
-
416
- def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
417
- pi=protein_seq.index(peptide_seq)
418
- if ( protein_seq[pi]=='M' )
419
- is_tryptic=false
420
- if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R') )
421
- is_tryptic=true
422
- elsif (pi==0)
423
- is_tryptic=true
424
- end
425
- return nil unless is_tryptic
426
-
427
- start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-2
428
- # require 'debugger';debugger
429
- return [start_codon_coord,start_codon_coord+2]
430
- else
431
- return nil
432
- end
433
- end
434
-
435
- def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
436
-
437
- if ( (peptide_seq[-1]!='K' && peptide_seq[-1]!='R' ) )
438
-
439
- codon_coord = (strand=='+') ? peptide_genomic_end-3 : peptide_genomic_start+1
440
- # require 'debugger';debugger
441
- return [codon_coord,codon_coord+2]
442
- else
443
- return nil
444
- end
445
- end
446
-
447
-
448
- def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
449
- pi=protein_seq.index(peptide_seq)
450
- if ( pi>0 && (protein_seq[pi-1]!='K' && protein_seq[pi-1]!='R' && protein_seq[pi]!='M') )
451
- # Since trypsin sometimes cleaves before P (ie breaking the rule)
452
- # we don't check for it and assume those cases are real tryptic termini
453
- reverse_leader_seq=protein_seq[0..pi].reverse
454
- mi=reverse_leader_seq.index('M')
455
-
456
- if ( mi==nil )
457
- puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
458
- return nil
459
- end
460
-
461
- mi=pi-mi
462
-
463
- ntermseq=protein_seq[mi..(pi-1)]
464
-
465
- # if ( ntermseq.length < minlen )
466
- # return nil
467
- # end
468
-
469
- # $STDOUT.write protein_seq[mi..(pi+peptide_seq.length-1)]
470
- # require 'debugger';debugger
471
- full_seq_with_annotations = "#{ntermseq}(cleaved)#{protein_seq[(pi..(pi+peptide_seq.length-1))]}"
472
-
473
- return full_seq_with_annotations
474
- else
475
- return nil
476
- end
477
- end
478
-
479
- def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)
480
-
481
- prot_seq = protein_seq
482
- pep_seq = peptide_seq
483
-
484
-
485
- peptide_coords = get_peptide_coordinates(prot_seq,pep_seq,protein_info,dna_sequence)
486
-
487
- if ( peptide_coords==nil ) # Return value of nil means the entry is a predicted transcript that should already be covered by 6-frame
488
- return []
489
- end
490
-
491
- gff_records=[]
492
-
493
- # Now convert peptide coordinate to genome coordinates
494
- # And create gff lines for each match
495
- peptide_coords.each do |coords|
496
-
497
- # require 'debugger';debugger
498
- pep_genomic_start = coords.first[0]
499
- pep_genomic_end = coords.last[1]
500
-
501
- pep_id = "#{prot_id}.p#{peptide_count.to_s}"
502
- pep_attributes = [["ID",pep_id],["Parent",prot_id],["Name",pep_seq]]
503
-
504
- pep_gff_line = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
505
- feature_type="peptide",start_position=pep_genomic_start,end_position=pep_genomic_end,score=peptide_prob,
506
- strand=protein_info.strand,frame=nil,attributes=pep_attributes)
507
-
508
- # For standard peptides
509
- frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
510
- gff_records += [pep_gff_line] + frag_gffs
511
- # require 'debugger';debugger
512
- # For peptides with only 1 tryptic terminus
513
- start_codon_coords=get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
514
- if start_codon_coords
515
- start_codon_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
516
- feature_type="start_codon",start_position=start_codon_coords[0],end_position=start_codon_coords[1],score='',
517
- strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
518
- gff_records+=[start_codon_gff]
519
- end
520
-
521
- cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
522
- if ( cterm_coords )
523
- cterm_gff = Bio::GFF::GFF3::Record.new(seqid = protein_info.scaffold,source="MSMS",
524
- feature_type="cterm",start_position=cterm_coords[0],end_position=cterm_coords[1],score='',
525
- strand=protein_info.strand,frame=nil,attributes=["Parent",pep_id])
526
- gff_records+=[start_codon_gff]
527
- end
528
-
529
- end
530
- # puts gff_records
531
-
532
- gff_records
533
- end
534
-
535
- def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
536
- pep_id = "#{prot_id}.p#{peptide_count.to_s}"
537
- signal_peptide = get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
538
- if signal_peptide
539
- $stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"
540
- raw_signal_peptide=signal_peptide.sub(/\(cleaved\)/,"")
541
- # Get raw signal_peptide sequence
542
-
543
- signal_peptide_coords=get_peptide_coordinates(protein_seq,raw_signal_peptide,protein_info,dna_sequence)
544
- if signal_peptide_coords
545
- signal_peptide_coords.each do |spcoords|
546
- signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,raw_signal_peptide,genomedb,"signalpeptide")
547
- gff_records += signal_peptide_gff
548
- end
549
- end
550
- end
551
- end
552
-
553
- def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
554
- nameindex = peptide_gff.attributes.index {|obj| obj[0]=="Name" }
555
- pep_seq = peptide_gff.attributes[nameindex][1]
556
- existing = peptides_covered_genome[pep_seq]
557
- return true if existing==peptide_gff.start
558
-
559
- return false
560
- end
561
-
562
- proteins = parse_proteins(tool.protxml)
65
+ proteins = parse_proteins(input_proxml)
563
66
  fastadb = prepare_fasta(tool.database,'prot')
564
67
  genomedb = nil
565
68
  if tool.genome
@@ -583,7 +86,7 @@ for prot in proteins
583
86
  end
584
87
 
585
88
  # Gets identifiers of all proteins (includeing indistinguishable ones)
586
- prot_names=protein_names(prot)
89
+ prot_names=tool.protein_names(prot)
587
90
 
588
91
 
589
92
  if tool.protein_find!=nil
@@ -591,19 +94,19 @@ for prot in proteins
591
94
  end
592
95
 
593
96
 
594
- peptides=peptide_nodes(prot)
97
+ peptides=tool.peptide_nodes(prot)
595
98
  entries_covered=[]
596
99
  for protein_name in prot_names
597
100
  protein_count += 1
598
101
  prot_id = "pr#{protein_count.to_s}"
599
102
  begin
600
103
 
601
- protein_fasta_entry = get_fasta_record(protein_name,fastadb)
602
- protein_info = cds_info_from_fasta(protein_fasta_entry)
104
+ protein_fasta_entry = tool.get_fasta_record(protein_name,fastadb)
105
+ protein_info = tool.cds_info_from_fasta(protein_fasta_entry)
603
106
 
604
- unless (tool.collapse_redundant_proteins && !is_new_genome_location(protein_info,entries_covered) )
107
+ unless (tool.collapse_redundant_proteins && !tool.is_new_genome_location(protein_info,entries_covered) )
605
108
 
606
- protein_gff = generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
109
+ protein_gff = tool.generate_protein_gff(protein_name,protein_info,prot_prob,protein_count)
607
110
 
608
111
  gff_db.records += ["##gff-version 3\n","##sequence-region #{protein_info.scaffold} 1 160\n",protein_gff]
609
112
 
@@ -624,15 +127,15 @@ for prot in proteins
624
127
  dna_sequence=nil
625
128
  if !protein_info.is_sixframe
626
129
  throw "A genome is required if predicted transcripts are to be mapped" unless genomedb!=nil
627
- dna_sequence = get_dna_sequence(protein_info,genomedb)
130
+ dna_sequence = tool.get_dna_sequence(protein_info,genomedb)
628
131
  end
629
132
 
630
133
 
631
- peptide_gff = generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
134
+ peptide_gff = tool.generate_gff_for_peptide_mapped_to_protein(prot_seq,pep_seq,protein_info,prot_id,pprob,peptide_count,dna_sequence,genomedb)
632
135
 
633
- unless (peptide_gff.length==0 || peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
136
+ unless (peptide_gff.length==0 || tool.peptide_gff_is_duplicate(peptide_gff[0],peptides_covered_genome))
634
137
 
635
- add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
138
+ tool.add_putative_nterm_to_gff(peptide_gff,pep_seq,prot_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
636
139
 
637
140
  gff_db.records += peptide_gff
638
141
 
@@ -19,23 +19,9 @@ include LibXML
19
19
  tool=Tool.new([:explicit_output])
20
20
  tool.option_parser.banner = "Convert a protXML file to a tab delimited table.\n\nUsage: protxml_to_table.rb [options] file1.protXML"
21
21
 
22
- tool.options.groups=false
23
- tool.option_parser.on("--groups","Print output by groups rather than for each protein") do
24
- tool.options.groups=true
25
- end
26
-
27
- # tool.options.proteinid_regex=".*?\|.*?\|(.*)"
28
- # tool.option_parser.on( '--regex rexpr', 'Regex' ) do |regex|
29
- # tool.options.proteinid_regex=regex
30
- # end
22
+ tool.add_boolean_option(:groups,false,["--groups","Print output by groups rather than for each protein"])
31
23
 
32
- exit unless tool.check_options
33
-
34
- if ( ARGV[0].nil? )
35
- puts "You must supply an input file"
36
- puts tool.option_parser
37
- exit
38
- end
24
+ exit unless tool.check_options(true)
39
25
 
40
26
  input_file=ARGV[0]
41
27