mspire 0.8.4 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -19,7 +19,7 @@ Prince JT, Marcotte EM. <b>mspire: mass spectrometry proteomics in Ruby.</b> *Bi
19
19
 
20
20
  ### imzml
21
21
 
22
- Mspire is the *only* converter from mzml into imzml.
22
+ Mspire is the *only* command-line converter from mzml into imzml (also see [imzMLConverter](http://www.cs.bham.ac.uk/~ibs/imzMLConverter/)).
23
23
 
24
24
  * handles both processed and continuous modes
25
25
  * gracefully handles SIM data
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.8.4
1
+ 0.8.5
data/lib/mspire/fasta.rb CHANGED
@@ -58,33 +58,11 @@ module Mspire
58
58
  Bio::FlatFile.new(Bio::FastaFormat, io)
59
59
  end
60
60
 
61
- =begin
62
- # returns two hashes [id_to_length, id_to_description]
63
- # faster (~4x) than official route.
64
- def self.protein_lengths_and_descriptions(file)
65
- protid_to_description = {}
66
- protid_to_length = {}
67
- re = /^>([^\s]+) (.*)/
68
- ids = []
69
- lengths = []
70
- current_length = nil
71
- IO.foreach(file) do |line|
72
- line.chomp!
73
- if md=re.match(line)
74
- lengths << current_length
75
- current_id = md[1]
76
- ids << current_id
77
- current_length = 0
78
- protid_to_description[current_id] = md[2]
79
- else
80
- current_length += line.size
81
- end
82
- end
83
- lengths << current_length
84
- lengths.shift # remove the first nil entry
85
- [Hash[ids.zip(lengths).to_a], protid_to_description]
61
+ # takes the header string and returns the uniprot id
62
+ #
63
+ # 'sp|Q04917|1433F_HUMAN' #=> 'Q04917'
64
+ def self.uniprot_id(header)
65
+ header[/^[^\|]+\|([^\|]+)\|/, 1]
86
66
  end
87
- =end
88
-
89
67
  end
90
68
  end
@@ -0,0 +1,248 @@
1
+ require 'optparse'
2
+ require 'mspire/digester'
3
+ require 'mspire/fasta'
4
+ require 'mspire/ident/peptide/db'
5
+
6
+ class Mspire::Ident::Peptide::Db::Creator
7
+ MAX_NUM_AA_EXPANSION = 3
8
+
9
+ # the twenty standard amino acids
10
+ STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
11
+ EXPAND_AA = {'X' => STANDARD_AA}
12
+
13
+ DEFAULT_PEPTIDE_CENTRIC_DB = {
14
+ missed_cleavages: 2,
15
+ min_length: 4,
16
+ enzyme: Mspire::Digester[:trypsin],
17
+ remove_digestion_file: true,
18
+ cleave_initiator_methionine: true,
19
+ expand_aa: true,
20
+ uniprot: true
21
+ }
22
+
23
+ def self.cmdline(argv)
24
+
25
+ opt = {
26
+ :remove_digestion_file => true,
27
+ :enzyme => Mspire::Digester[:trypsin]
28
+ }
29
+ opts = OptionParser.new do |op|
30
+ op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
31
+ op.separator "output: "
32
+ op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
33
+ op.separator "format:"
34
+ op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
35
+ op.separator ""
36
+ op.separator " Initiator Methionines - by default, will generate two peptides"
37
+ op.separator " for any peptide found at the N-termini starting with 'M'"
38
+ op.separator " (i.e., one with and one without the leading methionine)"
39
+ op.separator ""
40
+ op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
41
+ op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
42
+ op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
43
+ op.on("--no-expand-x", "don't enumerate aa possibilities", "(removes these peptides)") { opt[:expand_aa] = false }
44
+ op.on("--no-uniprot", "use entire protid section of fasta header", "for non-uniprot fasta files") { opt[:uniprot] = false }
45
+ op.on("--trie", "use a trie (for very large uniprot files)", "must have fast_trie gem installed") {|v| opt[:trie] = v }
46
+ op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Mspire::Insilico::Digester.const_get(v.upcase) }
47
+ op.on("--list-enzymes", "lists approved enzymes and exits") do
48
+ puts Mspire::Digester::ENZYMES.keys.join("\n")
49
+ exit
50
+ end
51
+ end
52
+
53
+ opts.parse!(argv)
54
+
55
+ if argv.size == 0
56
+ puts opts || exit
57
+ end
58
+
59
+ argv.map do |file|
60
+ creator = Mspire::Ident::Peptide::Db::Creator.new
61
+ creator.create(file, opt)
62
+ end
63
+ end
64
+
65
+ # returns the name of the digestion file that was written
66
+ def create_digestion_file(fasta_file, opts={})
67
+ opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
68
+
69
+ (missed_cleavages, enzyme, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :enzyme, :cleave_initiator_methionine, :expand_aa)
70
+ start_time = Time.now
71
+ print "Digesting #{fasta_file} ..." if $VERBOSE
72
+
73
+ letters_to_expand_re = Regexp.new("[" << Regexp.escape(EXPAND_AA.keys.join) << "]")
74
+
75
+ base = fasta_file.chomp(File.extname(fasta_file))
76
+ digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
77
+ File.open(digestion_file, "w") do |fh|
78
+ Mspire::Fasta.open(fasta_file) do |fasta|
79
+ fasta.each do |prot|
80
+ peptides = enzyme.digest(prot.sequence, missed_cleavages)
81
+ if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
82
+ m_peps = []
83
+ init_methionine_peps = []
84
+ peptides.each do |pep|
85
+ # if the peptide is at the beginning of the protein sequence
86
+ if prot.sequence[0,pep.size] == pep
87
+ m_peps << pep[1..-1]
88
+ end
89
+ end
90
+ peptides.push(*m_peps)
91
+ end
92
+ peptides =
93
+ if expand_aa
94
+ peptides.flat_map do |pep|
95
+ (pep =~ letters_to_expand_re) ? expand_peptides(pep, EXPAND_AA) : pep
96
+ end
97
+ else
98
+ peptides.map {|pep| pep =~ letters_to_expand_re }.compact
99
+ end
100
+ header = prot.header
101
+ id = opts[:uniprot] ? Mspire::Fasta.uniprot_id(header) : header.split(/\s+/).first
102
+ fh.puts( id + "\t" + peptides.join(" ") )
103
+ end
104
+ end
105
+ end
106
+ puts "#{Time.now - start_time} sec" if $VERBOSE
107
+ digestion_file
108
+ end
109
+
110
+ # returns the full path of the created file
111
+ def db_from_fasta_digestion_file(digestion_file, opts={})
112
+ opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
113
+
114
+ start_time = Time.now
115
+ puts "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
116
+
117
+ puts "#{Time.now - start_time} sec" if $VERBOSE
118
+ hash_like = hash_like_from_digestion_file(digestion_file, opts[:min_length], opts[:trie])
119
+
120
+ base = digestion_file.chomp(File.extname(digestion_file))
121
+ final_outfile = base + ".min_aaseq#{opts[:min_length]}" + ".yml"
122
+
123
+ start_time = Time.now
124
+ print "Writing #{hash_like.size} peptides to #{} ..." if $VERBOSE
125
+
126
+ File.open(final_outfile, 'w') do |out|
127
+ hash_like.each do |k,v|
128
+ #out.puts( [k, v.join(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)].join(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER) )
129
+ out.puts "#{k}#{Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER}#{v}"
130
+ end
131
+ end
132
+ puts "#{Time.now - start_time} sec" if $VERBOSE
133
+
134
+ if opts[:remove_digestion_file]
135
+ File.unlink(digestion_file)
136
+ end
137
+ File.expand_path(final_outfile)
138
+ end
139
+
140
+ def hash_like_tree
141
+ require 'trie'
142
+ trie = Trie.new
143
+ def trie.[](key)
144
+ val = self.get(key)
145
+ if val.nil?
146
+ self.add(key,"")
147
+ self.get(key)
148
+ else
149
+ val
150
+ end
151
+ end
152
+ trie
153
+ end
154
+
155
+ def hash_like_from_digestion_file(digestion_file, min_length, use_trie=false)
156
+ cnt = 0
157
+ if use_trie
158
+ raise NotImplementedError
159
+ #puts "using trie" if $VERBOSE
160
+ #trie = hash_like_tree
161
+ #line_cnt = 0
162
+ #::IO.foreach(digestion_file) do |line|
163
+ #line_cnt += 1
164
+ ##puts "LINE COUND"
165
+ ##p line_cnt
166
+ #(prot, *peps) = line.chomp!.split(/\s+/)
167
+ ##p peps
168
+ ##p peps.class
169
+ ## prot is something like this: "P31946"
170
+ #puts line
171
+ #peps.each do |pep|
172
+ #if pep.size >= min_length
173
+ #to_set =
174
+ #if val = trie.get(pep)
175
+ #val + Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER + prot
176
+ #else
177
+ #prot
178
+ #end
179
+ #p to_set.size
180
+ #trie.add(pep, to_set)
181
+ #end
182
+ #end
183
+ #cnt += 1
184
+ #puts cnt if (cnt % 1000) == 0
185
+ #end
186
+ #abort "HERE"
187
+ #trie
188
+ else
189
+ hash = {}
190
+ ::IO.foreach(digestion_file) do |line|
191
+ (prot, *peps) = line.chomp!.split(/\s+/)
192
+ # prot is something like this: "P31946"
193
+ peps.each do |pep|
194
+ if pep.size >= min_length
195
+ if val = hash[pep]
196
+ val << Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER << prot
197
+ else
198
+ val = prot
199
+ end
200
+ hash[pep] = val
201
+ end
202
+ end
203
+ cnt += 1
204
+ puts cnt if (cnt % 1000) == 0
205
+ end
206
+ hash
207
+ end
208
+ end
209
+
210
+ # writes a new file with the added 'min_aaseq<Integer>'
211
+ # creates a temporary digestion file that contains all peptides produced by
212
+ # digesting with the given missed_cleavages (i.e., min_length is not applied
213
+ # to this file but to the final peptide-centric db)
214
+ # returns the full name of the written file.
215
+ def create(fasta_file, opts={})
216
+ opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
217
+ digestion_file = create_digestion_file(fasta_file, opts)
218
+ db_from_fasta_digestion_file(digestion_file, opts)
219
+ end
220
+
221
+ # does combinatorial expansion of all letters requesting it.
222
+ # expand_aa is hash like: {'X'=>STANDARD_AA}
223
+ # returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
224
+ # be expanded
225
+ # returns an array containing only the original peptide if there is nothing to expand
226
+ def expand_peptides(peptide, expand_aa_hash)
227
+ letters_in_order = expand_aa_hash.keys.sort
228
+ index_and_key = []
229
+ peptide.split('').each_with_index do |char,i|
230
+ if let_index = letters_in_order.index(char)
231
+ index_and_key << [i, letters_in_order[let_index]]
232
+ end
233
+ end
234
+ if index_and_key.size > MAX_NUM_AA_EXPANSION
235
+ return nil
236
+ end
237
+ to_expand = [peptide]
238
+ index_and_key.each do |i,letter|
239
+ new_peps = []
240
+ while current_pep = to_expand.shift do
241
+ new_peps << expand_aa_hash[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
242
+ end
243
+ to_expand = new_peps.flatten
244
+ end
245
+ to_expand
246
+ end
247
+ end
248
+
@@ -0,0 +1,62 @@
1
+ require 'mspire/ident/peptide/db'
2
+
3
+ class Mspire::Ident::Peptide::Db::IO
4
+ # an object for on disk retrieval of db entries
5
+ # proteins are returned as an array.
6
+ # behaves like a hash once it is opened.
7
+ include Enumerable
8
+ def self.open(filename, &block)
9
+ raise ArgumentError unless block
10
+ File.open(filename) do |io|
11
+ block.call(self.new(io))
12
+ end
13
+ end
14
+
15
+ attr_accessor :io
16
+ attr_accessor :index
17
+
18
+ def initialize(io)
19
+ @io = io
20
+ @index = {}
21
+ re = /^(\w+)#{Regexp.escape(Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER)}/
22
+ prev_io_pos = io.pos
23
+ triplets = io.each_line.map do |line|
24
+ key = re.match(line)[1]
25
+ [key, prev_io_pos + key.bytesize+Mspire::Ident::Peptide::Db::KEY_VALUE_DELIMITER.bytesize, prev_io_pos=io.pos]
26
+ end
27
+ triplets.each do |key, start, end_pos|
28
+ @index[key] = [start, end_pos-start]
29
+ end
30
+ end
31
+
32
+ # returns an array of proteins for the given key (peptide aaseq)
33
+ def [](key)
34
+ (start, length) = @index[key]
35
+ return nil unless start
36
+ @io.seek(start)
37
+ string = @io.read(length)
38
+ string.chomp!
39
+ string.split(Mspire::Ident::Peptide::Db::PROTEIN_DELIMITER)
40
+ end
41
+
42
+ # number of entries
43
+ def size ; @index.size end
44
+ alias_method :length, :size
45
+
46
+ def keys
47
+ @index.keys
48
+ end
49
+
50
+ # all the protein lists
51
+ def values
52
+ keys.map {|key| self[key] }
53
+ end
54
+
55
+ # yields a pair of aaseq and protein array
56
+ def each(&block)
57
+ @index.each do |key, start_length|
58
+ block.call([key, self[key]])
59
+ end
60
+ end
61
+ end
62
+
@@ -1,243 +1,36 @@
1
- require 'mspire/digester'
2
- require 'mspire/fasta'
3
- require 'optparse'
1
+ require 'yaml'
4
2
 
5
3
  module Mspire ; end
6
4
  module Mspire::Ident ; end
7
5
  module Mspire::Ident::Peptide ; end
8
6
 
9
- # the object itself is a modified Hash.
10
- # It is initialized with the database file and a protein array can be
11
- # retrieved with the #[] method given an amino acid sequence. All other
12
- # methods are untested at this time and should be avoided!
13
- class Mspire::Ident::Peptide::Db < Hash
14
- MAX_NUM_AA_EXPANSION = 3
15
-
16
- # the twenty standard amino acids
17
- STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
18
-
19
- DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 2, :min_length => 4, :enzyme => Mspire::Digester[:trypsin], :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
20
-
7
+ # Very simple object for protein retrieval from a peptide-centric database
8
+ # See Mspire::Ident::Peptide::Db::IO for an on-disk version for larger files.
9
+ class Mspire::Ident::Peptide::Db
21
10
  PROTEIN_DELIMITER = "\t"
22
- KEY_VALUE_DELIMITER = ": "
23
-
24
- def self.cmdline(argv)
25
-
26
- opt = {
27
- :remove_digestion_file => true,
28
- :enzyme => Mspire::Digester[:trypsin]
29
- }
30
- opts = OptionParser.new do |op|
31
- op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
32
- op.separator "output: "
33
- op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
34
- op.separator "format:"
35
- op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
36
- op.separator ""
37
- op.separator " Initiator Methionines - by default, will generate two peptides"
38
- op.separator " for any peptide found at the N-termini starting with 'M'"
39
- op.separator " (i.e., one with and one without the leading methionine)"
40
- op.separator ""
41
- op.on("--missed-cleavages <#{opt[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
42
- op.on("--min-length <#{opt[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
43
- op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
44
- op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
45
- op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Mspire::Insilico::Digester.const_get(v.upcase) }
46
- op.on("--list-enzymes", "lists approved enzymes and exits") do
47
- puts Mspire::Digester::ENZYMES.keys.join("\n")
48
- exit
49
- end
50
- end
51
-
52
- opts.parse!(argv)
53
-
54
- if argv.size == 0
55
- puts opts || exit
56
- end
57
-
58
- argv.map do |file|
59
- Mspire::Ident::Peptide::Db.peptide_centric_db(file, opt)
60
- end
61
- end
62
-
63
- # writes a new file with the added 'min_aaseq<Integer>'
64
- # creates a temporary digestion file that contains all peptides digesting
65
- # with certain missed_cleavages (i.e., min_seq_length is not applied to
66
- # this file but on the final peptide centric db)
67
- # returns the full name of the written file.
68
- def self.peptide_centric_db(fasta_file, opts={})
69
- opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)
70
-
71
- (missed_cleavages, min_length, enzyme, id_regexp, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :id_regexp, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa)
72
- start_time = Time.now
73
- print "Digesting #{fasta_file} ..." if $VERBOSE
74
-
75
- if expand_aa
76
- letters_to_expand_re = Regexp.new("[" << Regexp.escape(expand_aa.keys.join) << "]")
77
- end
78
-
79
- base = fasta_file.chomp(File.extname(fasta_file))
80
- digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
81
- File.open(digestion_file, "w") do |fh|
82
- Mspire::Fasta.open(fasta_file) do |fasta|
83
- fasta.each do |prot|
84
- peptides = enzyme.digest(prot.sequence, missed_cleavages)
85
- if (cleave_initiator_methionine && (prot.sequence[0,1] == "M"))
86
- m_peps = []
87
- init_methionine_peps = []
88
- peptides.each do |pep|
89
- # if the peptide is at the beginning of the protein sequence
90
- if prot.sequence[0,pep.size] == pep
91
- m_peps << pep[1..-1]
92
- end
93
- end
94
- peptides.push(*m_peps)
95
- end
96
- if expand_aa
97
- peptides = peptides.map do |pep|
98
- if pep =~ letters_to_expand_re
99
- expand_peptides(pep, expand_aa)
100
- else
101
- pep
102
- end
103
- end.flatten
104
- end
105
- fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
106
- end
107
- end
108
- end
109
- puts "#{Time.now - start_time} sec" if $VERBOSE
110
-
111
-
112
- start_time = Time.now
113
- print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE
11
+ KEY_VALUE_DELIMITER = ': '
114
12
 
115
- hash = Hash.new {|h,k| h[k] = [] }
116
- ::IO.foreach(digestion_file) do |line|
117
- (prot, *peps) = line.chomp!.split(/\s+/)
118
- # prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot
119
- peps.each do |pep|
120
- if pep.size >= min_length
121
- hash[pep] << prot
122
- end
123
- end
124
- end
125
- puts "#{Time.now - start_time} sec" if $VERBOSE
126
-
127
- base = digestion_file.chomp(File.extname(digestion_file))
128
- final_outfile = base + ".min_aaseq#{min_length}" + ".yml"
129
-
130
- start_time = Time.now
131
- print "Writing #{hash.size} peptides to #{} ..." if $VERBOSE
132
-
133
- File.open(final_outfile, 'w') do |out|
134
- hash.each do |k,v|
135
- out.puts( [k, v.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
136
- end
137
- end
138
- puts "#{Time.now - start_time} sec" if $VERBOSE
139
-
140
- if remove_digestion_file
141
- File.unlink(digestion_file)
142
- end
143
- File.expand_path(final_outfile)
144
- end
145
-
146
- # does combinatorial expansion of all letters requesting it.
147
- # expand_aa is hash like: {'X'=>STANDARD_AA}
148
- # returns nil if there are more than MAX_NUM_AA_EXPANSION amino acids to
149
- # be expanded
150
- # returns an empty array if there is no expansion
151
- def self.expand_peptides(peptide, expand_aa)
152
- letters_in_order = expand_aa.keys.sort
153
- index_and_key = []
154
- peptide.split('').each_with_index do |char,i|
155
- if let_index = letters_in_order.index(char)
156
- index_and_key << [i, letters_in_order[let_index]]
157
- end
158
- end
159
- if index_and_key.size > MAX_NUM_AA_EXPANSION
160
- return nil
161
- end
162
- to_expand = [peptide]
163
- index_and_key.each do |i,letter|
164
- new_peps = []
165
- while current_pep = to_expand.shift do
166
- new_peps << expand_aa[letter].map {|v| dp = current_pep.dup ; dp[i] = v ; dp }
167
- end
168
- to_expand = new_peps.flatten
169
- end
170
- to_expand
171
- end
13
+ attr_accessor :data
172
14
 
173
15
  def initialize(db_file)
174
- self.replace(YAML.load_file(db_file))
16
+ @data = YAML.load_file(db_file)
175
17
  end
176
18
 
177
- alias_method :old_bracket, '[]'.to_sym
178
-
179
- # returns the protein id's as an array
19
+ # returns protein id's as an array
180
20
  def [](key)
181
- old_bracket(key).chomp.split(PROTEIN_DELIMITER)
21
+ val=@data[key]
22
+ val.chomp.split(PROTEIN_DELIMITER) if val
182
23
  end
183
24
 
184
- # an object for on disk retrieval of db entries
185
- # proteins are returned as an array.
186
- # behaves much like a hash once it is opened.
187
- class IO
188
- include Enumerable
189
- def self.open(filename, &block)
190
- raise ArgumentError unless block
191
- File.open(filename) do |io|
192
- block.call(self.new(io))
193
- end
194
- end
195
-
196
- attr_accessor :io
197
- attr_accessor :index
198
-
199
- def initialize(io)
200
- @io = io
201
- @index = {}
202
- re = /^(\w+)#{Regexp.escape(KEY_VALUE_DELIMITER)}/
203
- prev_io_pos = io.pos
204
- triplets = io.each_line.map do |line|
205
- key = re.match(line)[1]
206
- [key, prev_io_pos + key.bytesize+KEY_VALUE_DELIMITER.bytesize, prev_io_pos=io.pos]
207
- end
208
- triplets.each do |key, start, end_pos|
209
- @index[key] = [start, end_pos-start]
210
- end
211
- end
212
-
213
- # returns an array of proteins for the given key (peptide aaseq)
214
- def [](key)
215
- (start, length) = @index[key]
216
- return nil unless start
217
- @io.seek(start)
218
- string = @io.read(length)
219
- string.chomp!
220
- string.split("\t")
221
- end
222
-
223
- # number of entries
224
- def size ; @index.size end
225
- alias_method :length, :size
226
-
227
- def keys
228
- @index.keys
229
- end
25
+ def keys
26
+ @data.keys
27
+ end
230
28
 
231
- # all the protein lists
232
- def values
233
- keys.map {|key| self[key] }
234
- end
29
+ def values
30
+ @data.values
31
+ end
235
32
 
236
- # yields a pair of aaseq and protein array
237
- def each(&block)
238
- @index.each do |key, start_length|
239
- block.call([key, self[key]])
240
- end
241
- end
33
+ def size
34
+ @data.size
242
35
  end
243
36
  end
@@ -19,12 +19,13 @@ class Mspire::Ident::PeptideHit
19
19
 
20
20
  # writes the peptide hits to a phq.tsv file. qvalues is a parallel array
21
21
  # to hits that can provide qvalues if not inherent to the hits
22
- # returns the filename.
22
+ # returns the filename. Expects each hit to implement #search_id, #id,
23
+ # #aaseq and #charge
23
24
  def to_file(filename, hits, qvalues=[])
24
25
  File.open(filename,'w') do |out|
25
26
  out.puts HEADER.join(FILE_DELIMITER)
26
27
  hits.zip(qvalues) do |hit, qvalue|
27
- out.puts [hit.search.id, hit.id, hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
28
+ out.puts [hit.search_id, hit.id, hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
28
29
  end
29
30
  end
30
31
  filename
@@ -0,0 +1,5 @@
1
+ # structure of a very simple file for holding peptide hit qvalues
2
+ # entries should be separated by a tab!!!
3
+ aaseq charge qvalue
4
+ String Integer Float
5
+ ... ... ...