protk 1.2.6.pre5 → 1.3.0.pre1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +84 -45
  3. data/bin/add_retention_times.rb +9 -5
  4. data/bin/augustus_to_proteindb.rb +7 -11
  5. data/bin/interprophet.rb +28 -46
  6. data/bin/make_decoy.rb +16 -48
  7. data/bin/mascot_search.rb +57 -71
  8. data/bin/mascot_to_pepxml.rb +13 -26
  9. data/bin/msgfplus_search.rb +70 -107
  10. data/bin/omssa_search.rb +52 -109
  11. data/bin/peptide_prophet.rb +44 -119
  12. data/bin/pepxml_to_table.rb +24 -27
  13. data/bin/protein_prophet.rb +22 -82
  14. data/bin/protxml_to_gff.rb +22 -519
  15. data/bin/protxml_to_table.rb +2 -16
  16. data/bin/sixframe.rb +10 -32
  17. data/bin/tandem_search.rb +30 -403
  18. data/bin/tandem_to_pepxml.rb +43 -0
  19. data/bin/unimod_to_loc.rb +1 -1
  20. data/ext/{protk/decoymaker → decoymaker}/decoymaker.c +74 -21
  21. data/ext/decoymaker/extconf.rb +3 -0
  22. data/lib/protk/constants.rb +16 -2
  23. data/lib/protk/data/default_config.yml +2 -1
  24. data/lib/protk/data/tandem_gpm_defaults.xml +175 -0
  25. data/lib/protk/data/tandem_isb_kscore_defaults.xml +123 -0
  26. data/lib/protk/data/tandem_isb_native_defaults.xml +123 -0
  27. data/lib/protk/data/tandem_params.xml +17 -54
  28. data/lib/protk/fastadb.rb +2 -2
  29. data/lib/protk/prophet_tool.rb +1 -1
  30. data/lib/protk/protxml_to_gff_tool.rb +474 -0
  31. data/lib/protk/search_tool.rb +58 -103
  32. data/lib/protk/setup_rakefile.rake +9 -5
  33. data/lib/protk/tandem_search_tool.rb +256 -0
  34. data/lib/protk/tool.rb +85 -104
  35. data/lib/protk.rb +1 -6
  36. metadata +24 -103
  37. data/bin/annotate_ids.rb +0 -59
  38. data/bin/asapratio.rb +0 -27
  39. data/bin/blastxml_to_table.rb +0 -119
  40. data/bin/correct_omssa_retention_times.rb +0 -27
  41. data/bin/feature_finder.rb +0 -95
  42. data/bin/file_convert.rb +0 -164
  43. data/bin/generate_omssa_loc.rb +0 -42
  44. data/bin/gffmerge.rb +0 -208
  45. data/bin/libra.rb +0 -70
  46. data/bin/toppas_pipeline.rb +0 -84
  47. data/bin/uniprot_annotation.rb +0 -141
  48. data/bin/xls_to_table.rb +0 -52
  49. data/bin/xpress.rb +0 -27
  50. data/ext/protk/decoymaker/extconf.rb +0 -3
  51. data/ext/protk/simplealign/extconf.rb +0 -3
  52. data/lib/protk/biotools_excel_converter.rb +0 -60
  53. data/lib/protk/eupathdb_gene_information_table.rb +0 -158
  54. data/lib/protk/gapped_aligner.rb +0 -264
  55. data/lib/protk/protein_annotator.rb +0 -646
  56. data/lib/protk/spreadsheet_extensions.rb +0 -79
  57. data/lib/protk/xtandem_defaults.rb +0 -11
@@ -0,0 +1,474 @@
1
+ require 'protk/tool'
2
+
3
+ class ProtXMLToGFFTool < Tool
4
+
5
# Sets up the command-line interface of the protXML -> GFF converter: passes
# [:explicit_output] to the Tool superclass, sets the usage banner and
# registers every option (the second argument of each add_*_option call is
# presumably the option's default value -- confirm against Tool).
def initialize
  super([:explicit_output])

  @option_parser.banner = "Create a gff containing peptide Observations.\n\nUsage: protxml_to_gff.rb "

  # Input files and protein filtering
  add_value_option(
    :database, nil,
    ["-d filename", "--database filename", "Database used for ms/ms searches (Fasta Format)"]
  )
  add_value_option(
    :genome, nil,
    ["-g filename", "--genome filename", "Nucleotide sequences for scaffolds (Fasta Format)"]
  )
  add_value_option(
    :protein_find, nil,
    ["-f term", "--find term", "Restrict output to proteins whose name matches the specified string"]
  )
  add_value_option(
    :nterm_minlen, 7,
    ["-n len", "--nterm-min-len len", "Only include inferred N-terminal sequences if longer than len"]
  )

  # Behaviour switches
  add_boolean_option(
    :skip_fasta_indexing, false,
    ["--skip-index", "Don't index database (Index should already exist)"]
  )
  add_boolean_option(
    :stack_charge_states, false,
    ["--stack-charge-states", "Different peptide charge states get separate gff entries"]
  )
  add_boolean_option(
    :collapse_redundant_proteins, false,
    ["--collapse-redundant-proteins", "Proteins that cover genomic regions already covered will be skipped"]
  )

  # Probability cut-offs
  add_value_option(
    :peptide_probability_threshold, 0.95,
    ["--threshold prob", "Peptide Probability Threshold (Default 0.95)"]
  )
  add_value_option(
    :protein_probability_threshold, 0.99,
    ["--prot-threshold prob", "Protein Probability Threshold (Default 0.99)"]
  )
end
22
+
23
+
24
# Returns the 'protein_name' attribute of protein_node followed by the
# 'protein_name' of every protxml:indistinguishable_protein node found
# beneath it, in the order the lookup yields them.
def protein_names(protein_node)
  alternates = protein_node.find('protxml:indistinguishable_protein','protxml:http://regis-web.systemsbiology.net/protXML')
  names = [protein_node['protein_name']]
  alternates.each { |alternate| names << alternate['protein_name'] }
  names
end
32
+
33
# Returns whatever protein_node.find yields for the protxml:peptide query
# in the protXML namespace.
def peptide_nodes(protein_node)
  protxml_namespace = 'protxml:http://regis-web.systemsbiology.net/protXML'
  protein_node.find('protxml:peptide', protxml_namespace)
end
36
+
37
+
38
# Looks up protein_name in fastadb via get_by_id and returns the entry.
#
# Raises KeyError when the lookup yields nil. The failure text is still
# printed (as before) and is now also carried by the exception, so code that
# rescues the error can tell which id was missing.
def get_fasta_record(protein_name,fastadb)
  entry = fastadb.get_by_id(protein_name)
  return entry unless entry.nil?

  failure = "Failed lookup for #{protein_name}"
  puts failure
  raise KeyError, failure
end
47
+
48
# Plain record describing where a database entry sits on the genome
# (scaffold, strand, start/end, optional coding sequences) plus naming
# information parsed from its fasta header.
class CDSInfo
  attr_accessor :fasta_id
  attr_accessor :strand
  attr_accessor :frame
  attr_accessor :name
  attr_accessor :scaffold
  attr_accessor :start
  attr_accessor :end
  attr_accessor :coding_sequences
  attr_accessor :is_sixframe
  attr_accessor :gene_id

  # True when candidate_entry lies on the same scaffold and strand as this
  # entry and their start..end ranges intersect. Ranges that merely touch
  # (one starts exactly where the other ends) are not counted as overlapping,
  # matching the boundary convention of the original first range check.
  def overlap(candidate_entry)
    return false if candidate_entry.scaffold != scaffold
    # Compare against the candidate's strand (the original compared this
    # object's strand with itself, which could never differ).
    return false if candidate_entry.strand != strand
    # Candidate starts at or after our end: it lies entirely after us.
    return false if candidate_entry.start >= self.end
    # Candidate ends at or before our start: it lies entirely before us.
    return false if candidate_entry.end <= start

    true
  end
end
69
+
70
# Builds a CDSInfo from a fasta entry.
#
# The entry description is split on spaces into "start|end" tokens: the first
# token gives the overall start/end, any further tokens become
# coding_sequences. Scaffold and name are parsed from entry_id. Entries whose
# id contains "frame" are treated as six-frame translations (strand derived
# from the frame number); all others get their strand from a "rev" marker in
# the name and a gene_id parsed from the name.
#
# Raises EncodingError when the description holds no coordinate tokens.
def cds_info_from_fasta(fasta_entry)
  positions = fasta_entry.identifiers.description.split(' ').collect do |coords|
    coords.split('|').collect { |pos| pos.to_i }
  end
  if positions.empty?
    raise EncodingError, "No coordinates found in the description of #{fasta_entry.entry_id}"
  end

  info = CDSInfo.new
  info.fasta_id = fasta_entry
  info.start = positions[0][0]
  info.end = positions[0][1]
  # Everything after the first token; empty when only one token is present.
  info.coding_sequences = positions.drop(1)

  info.scaffold = fasta_entry.entry_id.scan(/(scaffold_?\d+)_/)[0][0]
  info.name = fasta_entry.entry_id.scan(/lcl\|(.*)/)[0][0]

  if fasta_entry.entry_id =~ /frame/
    info.frame = info.name.scan(/frame_(\d)/)[0][0]
    # Frames above 3 are reported on the reverse strand
    info.strand = (info.frame.to_i > 3) ? '-' : '+'
    info.is_sixframe = true
  else
    info.strand = (info.name =~ /rev/) ? '-' : '+'
    info.gene_id = info.name.scan(/_\w{3}_(.*)\.t/)[0][0]
    info.is_sixframe = false
  end
  info
end
99
+
100
+
101
# Returns true when candidate_entry does not share a gene_id with, and does
# not overlap, any of existing_entries.
#
# A nil gene_id never counts as a match: six-frame entries carry no gene_id,
# and comparing nil with nil would otherwise make every six-frame candidate
# look like a repeat of the first one.
def is_new_genome_location(candidate_entry,existing_entries)
  candidate_gene = candidate_entry.gene_id

  existing_entries.none? do |existing|
    same_gene = !candidate_gene.nil? && existing.gene_id == candidate_gene
    same_gene || existing.overlap(candidate_entry)
  end
end
118
+
119
+
120
# Builds the GFF3 record for a protein: feature type "protein", source
# "MSMS", positioned from entry_info (scaffold, start, end, strand), scored
# with prot_prob and tagged with ID=prot_id and Name=entry_info.name.
# protein_name is accepted but not used.
def generate_protein_gff(protein_name,entry_info,prot_prob,prot_id)
  attributes = [["ID",prot_id],["Name",entry_info.name]]

  Bio::GFF::GFF3::Record.new(
    entry_info.scaffold,   # seqid
    "MSMS",                # source
    "protein",             # feature type
    entry_info.start,
    entry_info.end,
    prot_prob,             # score
    entry_info.strand,
    nil,                   # frame
    attributes
  )
end
127
+
128
# Returns the stretch of the scaffold sequence covered by protein_info,
# reverse-complemented when the entry is on the '-' strand.
#
# NOTE(review): the slice runs from index start-1 through index end, i.e. one
# position further than the [start-1..end-1] convention used when fragments
# are sliced elsewhere in this class -- confirm whether that is intended.
def get_dna_sequence(protein_info,genomedb)
  scaffold_record = get_fasta_record(protein_info.scaffold,genomedb)
  first_index = protein_info.start-1
  gene_sequence = scaffold_record.naseq.to_s[first_index..protein_info.end]

  return gene_sequence unless protein_info.strand == "-"

  Bio::Sequence::NA.new(gene_sequence).reverse_complement
end
139
+
140
# True when pep_seq occurs in the translation of gene_seq in any of the
# frames 1 through 6, false otherwise.
def peptide_is_in_sixframe(pep_seq,gene_seq)
  nucleotides = Bio::Sequence::NA.new(gene_seq)
  (1..6).any? { |frame| nucleotides.translate(frame).index(pep_seq) }
end
149
+
150
# Converts a peptide's position within a protein (pepstart/pepend, residue
# offsets) into genomic fragments, splitting the peptide wherever it crosses
# from one coding sequence into the next.
#
# coding_sequences is a list of [start,end] pairs; they are processed in
# order of start. gene_start anchors residue offset 0; gene_end is accepted
# but not used. The computation assumes the positive strand.
#
# Returns a one-element array wrapping the list of [start,end] fragments.
# Raises ArgumentError when the peptide extends past the last coding
# sequence (previously this dropped into a debugger breakpoint or failed on
# a nil lookup).
def fragment_coords_from_protein_coords(pepstart,pepend,gene_start,gene_end,coding_sequences)
  sorted_cds = coding_sequences.sort { |a, b| a[0] <=> b[0] }

  # Number of intronic bases between exon `exon_index` and the one after it.
  intron_after = lambda do |exon_index|
    following = sorted_cds[exon_index+1]
    if following.nil?
      raise ArgumentError,
            "Peptide (protein residues #{pepstart}..#{pepend}) extends beyond the last coding sequence"
    end
    following[0]-sorted_cds[exon_index][1]-1
  end

  # Assume positive strand
  p_i = pepstart*3+gene_start-1 # Initially we are looking for the first fragment
  pi_end = pepend*3+gene_start-1

  fragments = []
  finding_start = true

  sorted_cds.each_with_index do |cds_coords, i|
    cds_start = cds_coords[0]
    cds_end = cds_coords[1]

    if cds_end < p_i
      # Exon lies before the position we are looking for: shift both
      # positions past the following intron.
      intron_offset = intron_after.call(i)
      p_i += intron_offset
      pi_end += intron_offset
      # Once the start has been placed, such an exon is a middle exon that
      # the peptide spans completely.
      fragments << [cds_start,cds_end] unless finding_start
    elsif finding_start
      if pi_end <= cds_end # Whole peptide contained in a single exon
        fragments << [p_i+1,pi_end]
        break
      end

      # Peptide starts here and continues into later exons
      fragments << [p_i+1,cds_end]
      pi_end += intron_after.call(i)
      p_i = pi_end # From now on we are looking for the peptide's end
      finding_start = false
    else
      # A terminal exon
      fragments << [cds_start,p_i]
      break
    end
  end

  [fragments]
end
204
+
205
+ # gene_seq should already have been reverse_complemented if on reverse strand
206
# Locates pep_seq within prot_seq and hands the resulting residue range to
# fragment_coords_from_protein_coords together with the entry's start, end
# and coding_sequences.
#
# On the '-' strand both sequences are reversed before searching and the
# offset is advanced by one, because the stop codon then sits at the
# beginning of the sequence (when read forwards) and has to be skipped.
# Prints a warning and returns nil when the peptide is not found.
# gene_seq is accepted but not used.
def get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
  on_reverse_strand = protein_info.strand=='-'

  match_offset =
    if on_reverse_strand
      prot_seq.reverse.index(pep_seq.reverse)
    else
      prot_seq.index(pep_seq)
    end

  if match_offset.nil?
    puts "Warning: Unable to find peptide #{pep_seq} in this protein! #{protein_info}"
    return nil
  end

  pep_start_i = on_reverse_strand ? match_offset+1 : match_offset
  pep_end_i = pep_start_i+pep_seq.length

  fragment_coords_from_protein_coords(pep_start_i,pep_end_i,protein_info.start,protein_info.end,protein_info.coding_sequences)
end
235
+
236
# Finds every non-overlapping occurrence of pep_seq in prot_seq and converts
# each into genomic coordinates relative to protein_info.start (three bases
# per residue). On the '-' strand both sequences are reversed first.
#
# Returns one entry per occurrence, each of the form [[start,end]]; an empty
# array when there is no occurrence (or pep_seq is empty).
def get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
  # An empty peptide matches everywhere and would never advance the search
  return [] if pep_seq.empty?

  if protein_info.strand == '-'
    prot_seq = prot_seq.reverse
    pep_seq = pep_seq.reverse
  end

  # Literal search resuming just past each hit. The original re-searched from
  # the previous hit's own offset, so repeated peptides were all reported at
  # the position of the first occurrence.
  start_indexes = []
  search_from = 0
  while (found = prot_seq.index(pep_seq, search_from))
    start_indexes << found
    search_from = found + pep_seq.length
  end

  start_indexes.collect do |si|
    pep_genomic_start = protein_info.start + 3*si
    pep_genomic_end = pep_genomic_start + 3*pep_seq.length - 1
    [[pep_genomic_start,pep_genomic_end]]
  end
end
257
+
258
+ # Returns a list with one entry per match; each entry is a list of [genomic_start,genomic_end] fragments. Returns nil when the peptide cannot be located on a transcript.
259
# Dispatches on protein_info.is_sixframe: six-frame entries are mapped with
# get_peptide_coordinates_sixframe, everything else with
# get_peptide_coordinates_from_transcript_info (which also receives gene_seq).
def get_peptide_coordinates(prot_seq,pep_seq,protein_info,gene_seq)
  unless protein_info.is_sixframe
    return get_peptide_coordinates_from_transcript_info(prot_seq,pep_seq,protein_info,gene_seq)
  end

  get_peptide_coordinates_sixframe(prot_seq,pep_seq,protein_info)
end
266
+
267
+
268
# Builds one GFF3 record per [start,end] fragment in coords.
#
# Each record has feature type `name`, source "MSMS", Parent=pep_id, an ID of
# "<pep_id>.fg" for "CDS" features or "<pep_id>.sp" otherwise, and a Name
# holding the fragment's translation ("*" when translation fails). The phase
# of each fragment is carried over from the length of the fragments before
# it; fragments are walked in reverse order unless the strand is '+'.
#
# After building the records, the fragments are concatenated and translated
# as a sanity check; a mismatch with peptide_seq is reported with a warning
# (previously this dropped into a debugger breakpoint). The check is skipped
# when coords is empty.
def generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,name="fragment")
  scaff = get_fasta_record(protein_info.scaffold,genomedb)
  scaffold_seq = Bio::Sequence::NA.new(scaff.seq)
  on_reverse_strand = protein_info.strand=='-'

  frag_id = name=="CDS" ? "#{pep_id}.fg" : "#{pep_id}.sp"
  ordered_coords = protein_info.strand=='+' ? coords : coords.reverse

  fragment_phase = 0
  gff_lines = ordered_coords.collect do |frag_start,frag_end|
    frag_naseq = scaffold_seq[frag_start-1..frag_end-1]

    frag_seq =
      begin
        coding = on_reverse_strand ? frag_naseq.reverse_complement : frag_naseq
        coding.translate(fragment_phase+1)
      rescue StandardError
        # Best effort: a fragment that cannot be translated is labelled "*"
        puts "Unable to translate #{frag_naseq}" if frag_naseq.length > 1
        "*"
      end

    fragment_record = Bio::GFF::GFF3::Record.new(
      protein_info.scaffold, "MSMS", name, frag_start, frag_end, '',
      protein_info.strand, fragment_phase,
      [["Parent",pep_id],["ID",frag_id],["Name",frag_seq]]
    )

    # Bases left over after whole codons determine the next fragment's phase
    remainder = (frag_naseq.length-fragment_phase) % 3
    fragment_phase = (3-remainder) % 3

    fragment_record
  end

  # Sanity check: the joined fragments should translate back to the peptide
  concat_seq = coords.collect { |frag_start,frag_end| scaffold_seq[frag_start-1..frag_end-1] }
                     .inject { |joined, frag_naseq| joined + frag_naseq }
  unless concat_seq.nil?
    check_seq = on_reverse_strand ? concat_seq.reverse_complement.translate : concat_seq.translate
    puts "Fragment seqs not equal to peptide seqs" if check_seq != peptide_seq
  end

  gff_lines
end
327
+
328
# Returns [first,last] genomic coordinates of a putative start codon for the
# peptide, or nil.
#
# A start codon is reported only when the peptide begins with 'M' and either
# sits at the very start of the protein or is not preceded by 'K' or 'R'.
# On the '+' strand the codon is the first three bases of the peptide; on any
# other strand it is the last three. Returns nil when peptide_seq does not
# occur in protein_seq (previously a TypeError).
def get_start_codon_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  pi = protein_seq.index(peptide_seq)
  return nil if pi.nil?
  return nil unless protein_seq[pi]=='M'

  if pi > 0
    preceding = protein_seq[pi-1]
    return nil if preceding=='K' || preceding=='R'
  end

  start_codon_coord = (strand=='+') ? peptide_genomic_start : peptide_genomic_end-2
  [start_codon_coord,start_codon_coord+2]
end
346
+
347
# Returns [first,last] genomic coordinates for the C-terminal end of a
# peptide whose last residue is neither 'K' nor 'R'; nil otherwise.
# protein_seq is accepted but not used.
#
# NOTE(review): the '+' result is [end-3,end-1] and the other-strand result
# is [start+1,start+3], i.e. shifted one base inwards compared with the
# start-codon helper's [start,start+2] / [end-2,end] -- confirm intent.
def get_cterm_coords_for_peptide(peptide_genomic_start,peptide_genomic_end,peptide_seq,protein_seq,strand)
  return nil if ['K','R'].include?(peptide_seq[-1])

  codon_coord =
    if strand=='+'
      peptide_genomic_end-3
    else
      peptide_genomic_start+1
    end

  [codon_coord,codon_coord+2]
end
358
+
359
+
360
# For a peptide whose N-terminus is not explained by tryptic cleavage,
# returns "<leader>(cleaved)<peptide>", where <leader> runs from the nearest
# 'M' ahead of the peptide up to the residue just before it.
#
# Returns nil when the peptide is absent from protein_seq (previously a
# NoMethodError), starts the protein, starts with 'M', is preceded by 'K' or
# 'R', or when no 'M' precedes it (a message is printed in that last case).
#
# Since trypsin sometimes cleaves before P (ie breaking the rule) we don't
# check for it and assume those cases are real tryptic termini.
def get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
  pi = protein_seq.index(peptide_seq)
  return nil if pi.nil? || pi.zero?
  return nil if protein_seq[pi]=='M'
  return nil if ['K','R'].include?(protein_seq[pi-1])

  # Nearest methionine at or before the peptide start; the residue at pi is
  # known not to be 'M', so any hit lies ahead of the peptide.
  mi = protein_seq.rindex('M', pi)
  if mi.nil?
    puts "No methionine found ahead of peptide sequence. Unable to determine n-term sequence"
    return nil
  end

  ntermseq = protein_seq[mi..(pi-1)]
  "#{ntermseq}(cleaved)#{protein_seq[(pi..(pi+peptide_seq.length-1))]}"
end
390
+
391
# Produces the GFF3 records for one peptide mapped onto one protein.
#
# For every location returned by get_peptide_coordinates this emits:
#   * a "peptide" record spanning the first fragment's start to the last
#     fragment's end, scored with peptide_prob;
#   * the "CDS" fragment records from generate_fragment_gffs_for_coords;
#   * a "start_codon" record when get_start_codon_coords_for_peptide
#     reports one;
#   * a "cterm" record when get_cterm_coords_for_peptide reports one.
#
# Returns an empty array when no coordinates could be determined (nil from
# get_peptide_coordinates).
#
# Fixes over the previous version: the cterm branch now appends its own
# record (it used to append the start-codon record, or nil when there was
# none), and the Parent attribute of the start_codon/cterm records is passed
# as a list of [tag,value] pairs like every other record built here.
def generate_gff_for_peptide_mapped_to_protein(protein_seq,peptide_seq,protein_info,prot_id,peptide_prob,peptide_count,dna_sequence,genomedb=nil)
  peptide_coords = get_peptide_coordinates(protein_seq,peptide_seq,protein_info,dna_sequence)
  return [] if peptide_coords.nil?

  pep_id = "#{prot_id}.p#{peptide_count.to_s}"
  gff_records = []

  # Convert peptide coordinates to genome coordinates and create gff lines
  # for each match
  peptide_coords.each do |coords|
    pep_genomic_start = coords.first[0]
    pep_genomic_end = coords.last[1]

    pep_gff_line = Bio::GFF::GFF3::Record.new(
      protein_info.scaffold, "MSMS", "peptide", pep_genomic_start, pep_genomic_end, peptide_prob,
      protein_info.strand, nil, [["ID",pep_id],["Parent",prot_id],["Name",peptide_seq]]
    )

    # For standard peptides
    frag_gffs = generate_fragment_gffs_for_coords(coords,protein_info,pep_id,peptide_seq,genomedb,"CDS")
    gff_records += [pep_gff_line] + frag_gffs

    # For peptides with only 1 tryptic terminus
    start_codon_coords = get_start_codon_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if start_codon_coords
      gff_records << Bio::GFF::GFF3::Record.new(
        protein_info.scaffold, "MSMS", "start_codon", start_codon_coords[0], start_codon_coords[1], '',
        protein_info.strand, nil, [["Parent",pep_id]]
      )
    end

    cterm_coords = get_cterm_coords_for_peptide(pep_genomic_start,pep_genomic_end,peptide_seq,protein_seq,protein_info.strand)
    if cterm_coords
      gff_records << Bio::GFF::GFF3::Record.new(
        protein_info.scaffold, "MSMS", "cterm", cterm_coords[0], cterm_coords[1], '',
        protein_info.strand, nil, [["Parent",pep_id]]
      )
    end
  end

  gff_records
end
446
+
447
# Appends "signalpeptide" fragment records to gff_records when the peptide
# has a putative (non-tryptic) N-terminal leader, and writes a tab-separated
# "Nterm" line describing it to $stdout.
#
# gff_records is modified in place and also returned. The previous version
# used `+=`, which only rebound the local variable, so the caller's array
# never received the new records.
def add_putative_nterm_to_gff(gff_records,peptide_seq,protein_seq,protein_info,prot_id,peptide_count,dna_sequence,genomedb)
  signal_peptide = get_nterm_peptide_for_peptide(peptide_seq,protein_seq)
  return gff_records unless signal_peptide

  $stdout.write "Nterm\t#{signal_peptide}\t#{protein_info.name}\t#{protein_seq}\n"

  # Get raw signal_peptide sequence
  raw_signal_peptide = signal_peptide.sub(/\(cleaved\)/,"")

  signal_peptide_coords = get_peptide_coordinates(protein_seq,raw_signal_peptide,protein_info,dna_sequence)
  return gff_records unless signal_peptide_coords

  pep_id = "#{prot_id}.p#{peptide_count.to_s}"
  signal_peptide_coords.each do |spcoords|
    signal_peptide_gff = generate_fragment_gffs_for_coords(spcoords,protein_info,pep_id,raw_signal_peptide,genomedb,"signalpeptide")
    gff_records.concat(signal_peptide_gff)
  end

  gff_records
end
464
+
465
# True when peptides_covered_genome already maps this record's Name
# attribute value to the record's start position.
#
# peptide_gff.attributes is read as a list of [tag,value] pairs. A record
# without a Name attribute is reported as not duplicate (previously a
# TypeError).
def peptide_gff_is_duplicate(peptide_gff,peptides_covered_genome)
  name_attribute = peptide_gff.attributes.find { |obj| obj[0]=="Name" }
  return false if name_attribute.nil?

  peptides_covered_genome[name_attribute[1]] == peptide_gff.start
end
473
+
474
+ end