mspire 0.8.4 → 0.8.5

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -19,7 +19,7 @@ Prince JT, Marcotte EM. <b>mspire: mass spectrometry proteomics in Ruby.</b> *Bi
19
19
 
20
20
  ### imzml
21
21
 
22
- Mspire is the *only* converter from mzml into imzml.
22
+ Mspire is the *only* command-line converter from mzml into imzml (also see [imzMLConverter](http://www.cs.bham.ac.uk/~ibs/imzMLConverter/)).
23
23
 
24
24
  * handles both processed and continuous modes
25
25
  * gracefully handles SIM data
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.4
1
+ 0.8.5
data/lib/mspire/fasta.rb CHANGED
@@ -58,33 +58,11 @@ module Mspire
58
58
  Bio::FlatFile.new(Bio::FastaFormat, io)
59
59
  end
60
60
 
61
- =begin
62
- # returns two hashes [id_to_length, id_to_description]
63
- # faster (~4x) than official route.
64
- def self.protein_lengths_and_descriptions(file)
65
- protid_to_description = {}
66
- protid_to_length = {}
67
- re = /^>([^\s]+) (.*)/
68
- ids = []
69
- lengths = []
70
- current_length = nil
71
- IO.foreach(file) do |line|
72
- line.chomp!
73
- if md=re.match(line)
74
- lengths << current_length
75
- current_id = md[1]
76
- ids << current_id
77
- current_length = 0
78
- protid_to_description[current_id] = md[2]
79
- else
80
- current_length += line.size
81
- end
82
- end
83
- lengths << current_length
84
- lengths.shift # remove the first nil entry
85
- [Hash[ids.zip(lengths).to_a], protid_to_description]
61
+ # takes the header string and returns the uniprot id
62
+ #
63
+ # 'sp|Q04917|1433F_HUMAN' #=> 'Q04917'
64
+ def self.uniprot_id(header)
65
+ header[/^[^\|]+\|([^\|]+)\|/, 1]
86
66
  end
87
- =end
88
-
89
67
  end
90
68
  end
@@ -0,0 +1,248 @@
1
+ require 'optparse'
2
+ require 'mspire/digester'
3
+ require 'mspire/fasta'
4
+ require 'mspire/ident/peptide/db'
5
+
6
+ class Mspire::Ident::Peptide::Db::Creator
7
+ MAX_NUM_AA_EXPANSION = 3
8
+
9
+ # the twenty standard amino acids
10
+ STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
11
+ EXPAND_AA = {'X' => STANDARD_AA}
12
+
13
+ DEFAULT_PEPTIDE_CENTRIC_DB = {
14
+ missed_cleavages: 2,
15
+ min_length: 4,
16
+ enzyme: Mspire::Digester[:trypsin],
17
+ remove_digestion_file: true,
18
+ cleave_initiator_methionine: true,
19
+ expand_aa: true,
20
+ uniprot: true
21
+ }
22
+
23
+ def self.cmdline(argv)
24
+
25
+ opt = {
26
+ :remove_digestion_file => true,
27
+ :enzyme => Mspire::Digester[:trypsin]
28
+ }
29
+ opts = OptionParser.new do |op|
30
+ op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
31
+ op.separator "output: "
32
+ op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
33
+ op.separator "format:"
34
+ op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
35
+ op.separator ""
36
+ op.separator " Initiator Methionines - by default, will generate two peptides"
37
+ op.separator " for any peptide found at the N-termini starting with 'M'"
38
+ op.separator " (i.e., one with and one without the leading methionine)"
39
+ op.separator ""
40
+ op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
41
+ op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
42
+ op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
43
+ op.on("--no-expand-x", "don't enumerate aa possibilities", "(removes these peptides)") { opt[:expand_aa] = false }
44
+ op.on("--no-uniprot", "use entire protid section of fasta header", "for non-uniprot fasta files") { opt[:uniprot] = false }
45
+ op.on("--trie", "use a trie (for very large uniprot files)", "must have fast_trie gem installed") {|v| opt[:trie] = v }
46
+ op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Mspire::Insilico::Digester.const_get(v.upcase) }
47
+ op.on("--list-enzymes", "lists approved enzymes and exits") do
48
+ puts Mspire::Digester::ENZYMES.keys.join("\n")
49
+ exit
50
+ end
51
+ end
52
+
53
+ opts.parse!(argv)
54
+
55
+ if argv.size == 0
56
+ puts opts || exit
57
+ end
58
+
59
+ argv.map do |file|
60
+ creator = Mspire::Ident::Peptide::Db::Creator.new
61
+ creator.create(file, opt)
62
+ end
63
+ end
64
+
65
+ # returns the name of the digestion file that was written
66
+ def create_digestion_file(fasta_file, opts={})
67
+ opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
68
+
69
+ (missed_cleavages, enzyme, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :enzyme, :cleave_initiator_methionine, :expand_aa)
70
+ start_time = Time.now
71
+ print "Digesting #{fasta_file} ..." if $VERBOSE
72
+
73
+ letters_to_expand_re = Regexp.new("[" << Regexp.escape(EXPAND_AA.keys.join) << "]")
74
+
75
+ base = fasta_file.chomp(File.extname(fasta_file))
76
+ digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
77
+ File.open(digestion_file, "w") do |fh|
78
+ Mspire::Fasta.open(fasta_file) do |fasta|
79
+ fasta.each do |prot|
80
+ peptides = enzyme.digest(prot.sequence, missed_cleavages)
81
+ if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
82
+ m_peps = []
83
+ init_methionine_peps = []
84
+ peptides.each do |pep|
85
+ # if the peptide is at the beginning of the protein sequence
86
+ if prot.sequence[0,pep.size] == pep
87
+ m_peps << pep[1..-1]
88
+ end
89
+ end
90
+ peptides.push(*m_peps)
91
+ end
92
+ peptides =
93
+ if expand_aa
94
+ peptides.flat_map do |pep|
95
+ (pep =~ letters_to_expand_re) ? expand_peptides(pep, EXPAND_AA) : pep
96
+ end
97
+ else
98
+ peptides.map {|pep| pep =~ letters_to_expand_re }.compact
99
+ end
100
+ header = prot.header
101
+ id = opts[:uniprot] ? Mspire::Fasta.uniprot_id(header) : header.split(/\s+/).first
102
+ fh.puts( id + "\t" + peptides.join(" ") )
103
+ end
104
+ end
105
+ end
106
+ puts "#{Time.now - start_time} sec" if $VERBOSE
107
+ digestion_file
108
+ end
109
+
110
+ # returns the full path of the created file
111
+ def db_from_fasta_digestion_file(digestion_file, opts={})
112
+ opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
113
+
114
+ start_time = Time.now
115
+ puts "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
116
+
117
+ puts "#{Time.now - start_time} sec" if $VERBOSE
118
+ hash_like = hash_like_from_digestion_file(digestion_file, opts[:min_length], opts[:trie])
119
+
120
+ base = digestion_file.chomp(File.extname(digestion_file))
121
+ final_outfile = base + ".min_aaseq#{opts[:min_length]}" + ".yml"
122
+
123
+ start_time = Time.now
124
+ print "Writing #{hash_like.size} peptides to #{} ..." if $VERBOSE
125
+
126
+ File.open(final_outfile, 'w') do |out|
127
+ hash_like.each do |k,v|
128
+ #out.puts( [k, v.join(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)].join(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER) )
129
+ out.puts "#{k}#{Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER}#{v}"
130
+ end
131
+ end
132
+ puts "#{Time.now - start_time} sec" if $VERBOSE
133
+
134
+ if opts[:remove_digestion_file]
135
+ File.unlink(digestion_file)
136
+ end
137
+ File.expand_path(final_outfile)
138
+ end
139
+
140
+ def hash_like_tree
141
+ require 'trie'
142
+ trie = Trie.new
143
+ def trie.[](key)
144
+ val = self.get(key)
145
+ if val.nil?
146
+ self.add(key,"")
147
+ self.get(key)
148
+ else
149
+ val
150
+ end
151
+ end
152
+ trie
153
+ end
154
+
155
+ def hash_like_from_digestion_file(digestion_file, min_length, use_trie=false)
156
+ cnt = 0
157
+ if use_trie
158
+ raise NotImplementedError
159
+ #puts "using trie" if $VERBOSE
160
+ #trie = hash_like_tree
161
+ #line_cnt = 0
162
+ #::IO.foreach(digestion_file) do |line|
163
+ #line_cnt += 1
164
+ ##puts "LINE COUND"
165
+ ##p line_cnt
166
+ #(prot, *peps) = line.chomp!.split(/\s+/)
167
+ ##p peps
168
+ ##p peps.class
169
+ ## prot is something like this: "P31946"
170
+ #puts line
171
+ #peps.each do |pep|
172
+ #if pep.size >= min_length
173
+ #to_set =
174
+ #if val = trie.get(pep)
175
+ #val + Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER + prot
176
+ #else
177
+ #prot
178
+ #end
179
+ #p to_set.size
180
+ #trie.add(pep, to_set)
181
+ #end
182
+ #end
183
+ #cnt += 1
184
+ #puts cnt if (cnt % 1000) == 0
185
+ #end
186
+ #abort "HERE"
187
+ #trie
188
+ else
189
+ hash = {}
190
+ ::IO.foreach(digestion_file) do |line|
191
+ (prot, *peps) = line.chomp!.split(/\s+/)
192
+ # prot is something like this: "P31946"
193
+ peps.each do |pep|
194
+ if pep.size >= min_length
195
+ if val = hash[pep]
196
+ val << Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER << prot
197
+ else
198
+ val = prot
199
+ end
200
+ hash[pep] = val
201
+ end
202
+ end
203
+ cnt += 1
204
+ puts cnt if (cnt % 1000) == 0
205
+ end
206
+ hash
207
+ end
208
+ end
209
+
210
+ # writes a new file with the added 'min_aaseq<Integer>'
211
+ # creates a temporary digestion file that contains all peptides produced by
212
+ # digesting with the given missed_cleavages (i.e., min_length is not applied
213
+ # to this file but to the final peptide centric db)
214
+ # returns the full name of the written file.
215
+ def create(fasta_file, opts={})
216
+ opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
217
+ digestion_file = create_digestion_file(fasta_file, opts)
218
+ db_from_fasta_digestion_file(digestion_file, opts)
219
+ end
220
+
221
+ # does combinatorial expansion of all letters requesting it.
222
+ # expand_aa_hash is a hash like: {'X'=>STANDARD_AA}
223
+ # returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
224
+ # be expanded
225
+ # returns an array holding only the unchanged peptide if there is nothing to expand
226
+ def expand_peptides(peptide, expand_aa_hash)
227
+ letters_in_order = expand_aa_hash.keys.sort
228
+ index_and_key = []
229
+ peptide.split('').each_with_index do |char,i|
230
+ if let_index = letters_in_order.index(char)
231
+ index_and_key << [i, letters_in_order[let_index]]
232
+ end
233
+ end
234
+ if index_and_key.size > MAX_NUM_AA_EXPANSION
235
+ return nil
236
+ end
237
+ to_expand = [peptide]
238
+ index_and_key.each do |i,letter|
239
+ new_peps = []
240
+ while current_pep = to_expand.shift do
241
+ new_peps << expand_aa_hash[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
242
+ end
243
+ to_expand = new_peps.flatten
244
+ end
245
+ to_expand
246
+ end
247
+ end
248
+
@@ -0,0 +1,62 @@
1
+ require 'mspire/ident/peptide/db'
2
+
3
+ class Mspire::Ident::Peptide::Db::IO
4
+ # an object for on disk retrieval of db entries
5
+ # proteins are returned as an array.
6
+ # behaves like a hash once it is opened.
7
+ include Enumerable
8
+ def self.open(filename, &block)
9
+ raise ArgumentError unless block
10
+ File.open(filename) do |io|
11
+ block.call(self.new(io))
12
+ end
13
+ end
14
+
15
+ attr_accessor :io
16
+ attr_accessor :index
17
+
18
+ def initialize(io)
19
+ @io = io
20
+ @index = {}
21
+ re = /^(\w+)#{Regexp.escape(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER)}/
22
+ prev_io_pos = io.pos
23
+ triplets = io.each_line.map do |line|
24
+ key = re.match(line)[1]
25
+ [key, prev_io_pos + key.bytesize+Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER.bytesize, prev_io_pos=io.pos]
26
+ end
27
+ triplets.each do |key, start, end_pos|
28
+ @index[key] = [start, end_pos-start]
29
+ end
30
+ end
31
+
32
+ # returns an array of proteins for the given key (peptide aaseq)
33
+ def [](key)
34
+ (start, length) = @index[key]
35
+ return nil unless start
36
+ @io.seek(start)
37
+ string = @io.read(length)
38
+ string.chomp!
39
+ string.split(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)
40
+ end
41
+
42
+ # number of entries
43
+ def size ; @index.size end
44
+ alias_method :length, :size
45
+
46
+ def keys
47
+ @index.keys
48
+ end
49
+
50
+ # all the protein lists
51
+ def values
52
+ keys.map {|key| self[key] }
53
+ end
54
+
55
+ # yields a pair of aaseq and protein array
56
+ def each(&block)
57
+ @index.each do |key, start_length|
58
+ block.call([key, self[key]])
59
+ end
60
+ end
61
+ end
62
+
@@ -1,243 +1,36 @@
1
- require 'mspire/digester'
2
- require 'mspire/fasta'
3
- require 'optparse'
1
+ require 'yaml'
4
2
 
5
3
  module Mspire ; end
6
4
  module Mspire::Ident ; end
7
5
  module Mspire::Ident::Peptide ; end
8
6
 
9
- # the object itself is a modified Hash.
10
- # It is initialized with the database file and a protein array can be
11
- # retrieved with the #[] method given an amino acid sequence. All other
12
- # methods are untested at this time and should be avoided!
13
- class Mspire::Ident::Peptide::Db < Hash
14
- MAX_NUM_AA_EXPANSION = 3
15
-
16
- # the twenty standard amino acids
17
- STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
18
-
19
- DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 2, :min_length => 4, :enzyme => Mspire::Digester[:trypsin], :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
20
-
7
+ # Very simple object for protein retrieval from a peptide-centric database
8
+ # See Mspire::Ident::Peptide::Db::IO for an on-disc version for larger files.
9
+ class Mspire::Ident::Peptide::Db
21
10
  PROTEIN_DELIMITER = "\t"
22
- KEY_VALUE_DELIMITER = ": "
23
-
24
- def self.cmdline(argv)
25
-
26
- opt = {
27
- :remove_digestion_file => true,
28
- :enzyme => Mspire::Digester[:trypsin]
29
- }
30
- opts = OptionParser.new do |op|
31
- op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
32
- op.separator "output: "
33
- op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
34
- op.separator "format:"
35
- op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
36
- op.separator ""
37
- op.separator " Initiator Methionines - by default, will generate two peptides"
38
- op.separator " for any peptide found at the N-termini starting with 'M'"
39
- op.separator " (i.e., one with and one without the leading methionine)"
40
- op.separator ""
41
- op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
42
- op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
43
- op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
44
- op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
45
- op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Mspire::Insilico::Digester.const_get(v.upcase) }
46
- op.on("--list-enzymes", "lists approved enzymes and exits") do
47
- puts Mspire::Digester::ENZYMES.keys.join("\n")
48
- exit
49
- end
50
- end
51
-
52
- opts.parse!(argv)
53
-
54
- if argv.size == 0
55
- puts opts || exit
56
- end
57
-
58
- argv.map do |file|
59
- Mspire::Ident::Peptide::Db.peptide_centric_db(file, opt)
60
- end
61
- end
62
-
63
- # writes a new file with the added 'min_aaseq<Integer>'
64
- # creates a temporary digestion file that contains all peptides digesting
65
- # with certain missed_cleavages (i.e., min_seq_length is not applied to
66
- # this file but on the final peptide centric db)
67
- # returns the full name of the written file.
68
- def self.peptide_centric_db(fasta_file, opts={})
69
- opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
70
-
71
- (missed_cleavages, min_length, enzyme, id_regexp, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :id_regexp, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa)
72
- start_time = Time.now
73
- print "Digesting #{fasta_file} ..." if $VERBOSE
74
-
75
- if expand_aa
76
- letters_to_expand_re = Regexp.new("[" << Regexp.escape(expand_aa.keys.join) << "]")
77
- end
78
-
79
- base = fasta_file.chomp(File.extname(fasta_file))
80
- digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
81
- File.open(digestion_file, "w") do |fh|
82
- Mspire::Fasta.open(fasta_file) do |fasta|
83
- fasta.each do |prot|
84
- peptides = enzyme.digest(prot.sequence, missed_cleavages)
85
- if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
86
- m_peps = []
87
- init_methionine_peps = []
88
- peptides.each do |pep|
89
- # if the peptide is at the beginning of the protein sequence
90
- if prot.sequence[0,pep.size] == pep
91
- m_peps << pep[1..-1]
92
- end
93
- end
94
- peptides.push(*m_peps)
95
- end
96
- if expand_aa
97
- peptides = peptides.map do |pep|
98
- if pep =~ letters_to_expand_re
99
- expand_peptides(pep, expand_aa)
100
- else
101
- pep
102
- end
103
- end.flatten
104
- end
105
- fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
106
- end
107
- end
108
- end
109
- puts "#{Time.now - start_time} sec" if $VERBOSE
110
-
111
-
112
- start_time = Time.now
113
- print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
11
+ KEY_VALUE_DELIMITER = ': '
114
12
 
115
- hash = Hash.new {|h,k| h[k] = [] }
116
- ::IO.foreach(digestion_file) do |line|
117
- (prot, *peps) = line.chomp!.split(/\s+/)
118
- # prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot
119
- peps.each do |pep|
120
- if pep.size >= min_length
121
- hash[pep] << prot
122
- end
123
- end
124
- end
125
- puts "#{Time.now - start_time} sec" if $VERBOSE
126
-
127
- base = digestion_file.chomp(File.extname(digestion_file))
128
- final_outfile = base + ".min_aaseq#{min_length}" + ".yml"
129
-
130
- start_time = Time.now
131
- print "Writing #{hash.size} peptides to #{} ..." if $VERBOSE
132
-
133
- File.open(final_outfile, 'w') do |out|
134
- hash.each do |k,v|
135
- out.puts( [k, v.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
136
- end
137
- end
138
- puts "#{Time.now - start_time} sec" if $VERBOSE
139
-
140
- if remove_digestion_file
141
- File.unlink(digestion_file)
142
- end
143
- File.expand_path(final_outfile)
144
- end
145
-
146
- # does combinatorial expansion of all letters requesting it.
147
- # expand_aa is hash like: {'X'=>STANDARD_AA}
148
- # returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
149
- # be expanded
150
- # returns an empty array if there is no expansion
151
- def self.expand_peptides(peptide, expand_aa)
152
- letters_in_order = expand_aa.keys.sort
153
- index_and_key = []
154
- peptide.split('').each_with_index do |char,i|
155
- if let_index = letters_in_order.index(char)
156
- index_and_key << [i, letters_in_order[let_index]]
157
- end
158
- end
159
- if index_and_key.size > MAX_NUM_AA_EXPANSION
160
- return nil
161
- end
162
- to_expand = [peptide]
163
- index_and_key.each do |i,letter|
164
- new_peps = []
165
- while current_pep = to_expand.shift do
166
- new_peps << expand_aa[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
167
- end
168
- to_expand = new_peps.flatten
169
- end
170
- to_expand
171
- end
13
+ attr_accessor :data
172
14
 
173
15
  def initialize(db_file)
174
- self.replace(YAML.load_file(db_file))
16
+ @data = YAML.load_file(db_file)
175
17
  end
176
18
 
177
- alias_method :old_bracket, '[]'.to_sym
178
-
179
- # returns the protein id's as an array
19
+ # returns protein id's as an array
180
20
  def [](key)
181
- old_bracket(key).chomp.split(PROTEIN_DELIMITER)
21
+ val=@data[key]
22
+ val.chomp.split(PROTEIN_DELIMITER) if val
182
23
  end
183
24
 
184
- # an object for on disk retrieval of db entries
185
- # proteins are returned as an array.
186
- # behaves much like a hash once it is opened.
187
- class IO
188
- include Enumerable
189
- def self.open(filename, &block)
190
- raise ArgumentError unless block
191
- File.open(filename) do |io|
192
- block.call(self.new(io))
193
- end
194
- end
195
-
196
- attr_accessor :io
197
- attr_accessor :index
198
-
199
- def initialize(io)
200
- @io = io
201
- @index = {}
202
- re = /^(\w+)#{Regexp.escape(KEY_VALUE_DELIMITER)}/
203
- prev_io_pos = io.pos
204
- triplets = io.each_line.map do |line|
205
- key = re.match(line)[1]
206
- [key, prev_io_pos + key.bytesize+KEY_VALUE_DELIMITER.bytesize, prev_io_pos=io.pos]
207
- end
208
- triplets.each do |key, start, end_pos|
209
- @index[key] = [start, end_pos-start]
210
- end
211
- end
212
-
213
- # returns an array of proteins for the given key (peptide aaseq)
214
- def [](key)
215
- (start, length) = @index[key]
216
- return nil unless start
217
- @io.seek(start)
218
- string = @io.read(length)
219
- string.chomp!
220
- string.split("\t")
221
- end
222
-
223
- # number of entries
224
- def size ; @index.size end
225
- alias_method :length, :size
226
-
227
- def keys
228
- @index.keys
229
- end
25
+ def keys
26
+ @data.keys
27
+ end
230
28
 
231
- # all the protein lists
232
- def values
233
- keys.map {|key| self[key] }
234
- end
29
+ def values
30
+ @data.values
31
+ end
235
32
 
236
- # yields a pair of aaseq and protein array
237
- def each(&block)
238
- @index.each do |key, start_length|
239
- block.call([key, self[key]])
240
- end
241
- end
33
+ def size
34
+ @data.size
242
35
  end
243
36
  end
@@ -19,12 +19,13 @@ class Mspire::Ident::PeptideHit
19
19
 
20
20
  # writes the peptide hits to a phq.tsv file. qvalues is a parallel array
21
21
  # to hits that can provide qvalues if not inherent to the hits
22
- # returns the filename.
22
+ # returns the filename. Expects each hit to implement #search_id, #id,
23
+ # #aaseq and #charge (plus #qvalue for any hit without a matching entry in qvalues)
23
24
  def to_file(filename, hits, qvalues=[])
24
25
  File.open(filename,'w') do |out|
25
26
  out.puts HEADER.join(FILE_DELIMITER)
26
27
  hits.zip(qvalues) do |hit, qvalue|
27
- out.puts [hit.search.id, hit.id, hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
28
+ out.puts [hit.search_id, hit.id, hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
28
29
  end
29
30
  end
30
31
  filename
@@ -0,0 +1,5 @@
1
+ # structure of a very simple file for holding peptide hit qvalues
2
+ # entries should be separated by a tab!!!
3
+ aaseq charge qvalue
4
+ String Integer Float
5
+ ... ... ...