ms-sequest 0.0.17 → 0.0.18

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,316 @@
1
+ require 'ms/ident/pepxml'
2
+ require 'ms/ident/pepxml/spectrum_query'
3
+ require 'ms/ident/pepxml/search_result'
4
+ require 'ms/ident/pepxml/search_hit'
5
+ require 'ms/msrun'
6
+ require 'ms/sequest/srf'
7
+ require 'ms/sequest/pepxml'
8
+
9
class Ms::Sequest::Srf
  # Conversion of an Srf search result into an Ms::Ident::Pepxml object.
  # This module is mixed into Ms::Sequest::Srf at the end of the class body.
  module Pepxml

    # A hash with the following *symbol* keys may be set:
    #
    # Run Info
    # *:ms_model*:: nil
    # *:ms_ionization*:: 'ESI'
    # *:ms_detector*:: 'UNKNOWN'
    # *:ms_mass_analyzer*:: nil - <i>typically extracted from the srf file and matched with <b>ModelToMsAnalyzer</b></i>
    # *:ms_manufacturer*:: 'Thermo'
    #
    # Raw data
    # *:mz_dir*:: nil - <i>path to the mz[X]ML directory, defaults to the directory the srf file is contained in. mz[X]ML data must be available to embed retention times</i>
    # *:raw_data*:: \['.mzML', '.mzXML'\] - <i>preferred extension for raw data</i>
    #
    # Database
    # *:db_seq_type*:: 'AA' - <i>AA or NA</i>
    # *:db_dir*:: nil - <i>the directory the fasta file used for the search is housed in. A valid pepxml file must point to a valid fasta file!</i>
    # *:db_residue_size*:: nil - <i>An integer for the number of residues in the database. If true, calculates the size of the fasta database.</i>
    # *:db_name*:: nil
    # *:db_orig_database_url*:: nil
    # *:db_release_date*:: nil
    # *:db_release_identifier*:: nil
    #
    # Search Hits
    # *:num_hits*:: 1 - <i>the top number of hits to include</i>
    # *:retention_times*:: false - <i>include retention times in the file (requires mz_dir to be set)</i>
    # *:deltacn_orig*:: false - <i>when true, the original SEQUEST deltacn values are used. If false, Bioworks deltacn values are used which are derived by taking the original deltacn of the following hit. This gives the top ranking hit an informative deltacn but makes the deltacn meaningless for other hits.</i>
    #
    # *:pepxml_version*:: Ms::Ident::Pepxml::DEFAULT_PEPXML_VERSION - <i>Integer to set the pepxml version. The converter and xml output attempts to produce xml specific to the version.</i>
    # *:verbose*:: true - <i>set to false to quiet warnings</i>
    DEFAULT_OPTIONS = {
      :ms_model => nil,
      :ms_ionization => 'ESI',
      :ms_detector => 'UNKNOWN',
      :ms_mass_analyzer => nil,
      :ms_manufacturer => 'Thermo',

      :mz_dir => nil,
      #:raw_data => [".mzXML", '.mzML'],
      :raw_data => ['.mzML', '.mzXML'],

      :db_seq_type => 'AA',
      :db_dir => nil,
      :db_residue_size => nil,
      :db_name => nil,
      :db_orig_database_url => nil,
      :db_release_date => nil,
      :db_release_identifier => nil,

      :num_hits => 1,
      :retention_times => false,
      :deltacn_orig => false,

      :pepxml_version => Ms::Ident::Pepxml::DEFAULT_PEPXML_VERSION,
      :verbose => true,
    }

    # An array of regexp to string pairs. The regexps are matched (in order)
    # against the model (srf.header.model) and the string paired with the
    # first matching regexp will be used as the mass analyzer.
    #
    # /Orbitrap/:: 'Orbitrap'
    # /LCQ Deca XP/:: 'Ion Trap'
    # /LTQ/:: 'Ion Trap'
    # /\w+/:: 'UNKNOWN'
    ModelToMsAnalyzer = [
      [/Orbitrap/, 'Orbitrap'],
      [/LCQ Deca XP/, 'Ion Trap'],
      [/LTQ/, 'Ion Trap'],
      [/\w+/, 'UNKNOWN'],
    ]

    # Returns an Ms::Ident::Pepxml object built from this srf. See that
    # object for creating an xml string or writing to file.
    #
    # opts:: a hash of symbol keys overriding DEFAULT_OPTIONS (see the
    #        documentation on that constant for the recognized keys).
    #
    # When :verbose is true, a warning is printed if the fasta database
    # cannot be found, or if retention times were requested but no raw data
    # file could be located (in which case retention times are turned off).
    def to_pepxml(opts={})
      opt = DEFAULT_OPTIONS.merge(opts)
      srf = self

      # with newer pepxml version these are not required anymore
      hidden_opts = {
        # format of file storing the runner up peptides (if not present in
        # pepXML) this was made optional after version 19
        :out_data_type => "out", ## may be srf??
        # runner up search hit data type extension (e.g. .tgz)
        :out_data => ".srf",
      }
      opt.merge!(hidden_opts)

      params = srf.params
      header = srf.header

      opt[:ms_model] ||= header.model

      unless opt[:ms_mass_analyzer]
        # to_s guards against a header without a model (nil); in that case
        # no regexp matches and the mass analyzer is simply left unset.
        model = opt[:ms_model].to_s
        matched = ModelToMsAnalyzer.find { |regexp, _analyzer| model.match(regexp) }
        opt[:ms_mass_analyzer] = matched.last if matched
      end

      # get the database name
      db_filename = header.db_filename.sub(/\.hdr$/, '')
      if opt[:db_dir]
        db_filename = File.join(opt[:db_dir], db_filename.split(/[\/\\]+/).last)
      end
      if File.exist?(db_filename)
        db_filename = File.expand_path(db_filename)
      else
        msg = ["!!! WARNING !!!"]
        msg << "!!! Can't find database: #{db_filename}"
        msg << "!!! pepxml *requires* that the db path be valid"
        msg << "!!! make sure 1) the fasta file is available on this system"
        msg << "!!! 2) you've specified a valid directory with --db-dir (or :db_dir)"
        puts msg.join("\n") if opt[:verbose]
      end

      modifications_obj = Ms::Sequest::Pepxml::Modifications.new(params, header.modifications)
      mass_index = params.mass_index(:precursor)
      h_plus = mass_index['h+']

      # narrow the preferred extensions down to the first one that exists
      opt[:mz_dir] ||= srf.resident_dir
      found_ext = opt[:raw_data].find do |raw_data|
        Dir[File.join(opt[:mz_dir], srf.base_name_noext + raw_data)].first
      end
      opt[:raw_data] = [found_ext] if found_ext

      scan_to_ret_time =
        if opt[:retention_times]
          mz_file = Dir[File.join(opt[:mz_dir], srf.base_name_noext + opt[:raw_data].first)].first
          if mz_file
            Ms::Msrun.scans_to_times(mz_file)
          else
            # :verbose is documented as the switch that quiets warnings
            warn "turning retention_times off since no valid mz[X]ML file was found!!!" if opt[:verbose]
            opt[:retention_times] = false
            nil
          end
        end

      summary_xml_filename = srf.base_name_noext + '.xml'

      pepxml = Ms::Ident::Pepxml.new do |msms_pipeline_analysis|
        msms_pipeline_analysis.merge!(:summary_xml => summary_xml_filename, :pepxml_version => opt[:pepxml_version]) do |msms_run_summary|
          # prep the sample enzyme and search_summary
          msms_run_summary.merge!(
            :base_name => File.join(opt[:mz_dir], srf.base_name_noext),
            :ms_manufacturer => opt[:ms_manufacturer],
            :ms_model => opt[:ms_model],
            :ms_ionization => opt[:ms_ionization],
            :ms_mass_analyzer => opt[:ms_mass_analyzer],
            :ms_detector => opt[:ms_detector],
            :raw_data => opt[:raw_data].first,
            :raw_data_type => opt[:raw_data].first
          ) do |sample_enzyme, search_summary, spectrum_queries|
            sample_enzyme.merge!(params.sample_enzyme_hash)
            search_summary.merge!(
              :base_name=> srf.resident_dir + '/' + srf.base_name_noext,
              :search_engine => 'SEQUEST',
              :precursor_mass_type => params.precursor_mass_type,
              :fragment_mass_type => params.fragment_mass_type,
              :out_data_type => opt[:out_data_type],
              :out_data => opt[:out_data]
            ) do |search_database, enzymatic_search_constraint, modifications_ar, parameters_hash|
              search_database.merge!(
                :local_path => db_filename,
                :seq_type => opt[:db_seq_type],
                :database_name => opt[:db_name],
                :orig_database_url => opt[:db_orig_database_url],
                :database_release_date => opt[:db_release_date],
                :database_release_identifier => opt[:db_release_identifier]
              )

              case opt[:db_residue_size]
              when Integer
                search_database.size_of_residues = opt[:db_residue_size]
              when true
                search_database.set_size_of_residues!
              end

              enzymatic_search_constraint.merge!(
                :enzyme => params.enzyme,
                :max_num_internal_cleavages => params.max_num_internal_cleavages,
                :min_number_termini => params.min_number_termini
              )
              modifications_ar.replace(modifications_obj.modifications)
              parameters_hash.merge!(params.opts)
            end

            # i_ar is read below as [first_scan, last_scan, charge]
            spec_queries = srf.dta_files.zip(srf.out_files, srf.index).map do |dta_file,out_file,i_ar|
              precursor_neutral_mass = dta_file.mh - h_plus

              search_hits = out_file.hits[0,opt[:num_hits]].each_with_index.map do |pep,i|
                (prev_aa, pure_aaseq, next_aa) = Ms::Ident::Peptide.prepare_sequence(pep.sequence)
                calc_neutral_pep_mass = pep.mh - h_plus
                Ms::Ident::Pepxml::SearchHit.new(
                  :hit_rank => i+1,
                  :peptide => pure_aaseq,
                  :peptide_prev_aa => prev_aa,
                  :peptide_next_aa => next_aa,
                  :protein => pep.proteins.first.reference.split(' ')[0],
                  :num_tot_proteins => pep.proteins.size,
                  :num_matched_ions => pep.ions_matched,
                  :tot_num_ions => pep.ions_total,
                  :calc_neutral_pep_mass => calc_neutral_pep_mass,
                  :massdiff => precursor_neutral_mass - calc_neutral_pep_mass,
                  :num_tol_term => sample_enzyme.num_tol_term(prev_aa, pure_aaseq, next_aa),
                  :num_missed_cleavages => sample_enzyme.num_missed_cleavages(pure_aaseq),
                  :modification_info => modifications_obj.modification_info(Ms::Ident::Peptide.split_sequence(pep.sequence)[1])
                ) do |search_scores|
                  if opt[:deltacn_orig]
                    deltacn = pep.deltacn_orig
                    deltacnstar = nil
                  else
                    deltacn = pep.deltacn
                    # a deltacn of 1.1 is reported as 1.0
                    deltacn = 1.0 if deltacn == 1.1
                    # '1' when there is no following hit, '0' otherwise
                    deltacnstar = out_file.hits[i+1].nil? ? '1' : '0'
                  end
                  # NOTE: the score key is :deltacn (it was previously
                  # misspelled :deltcn), matching the Sequest score struct.
                  search_scores.merge!( :xcorr => pep.xcorr, :deltacn => deltacn,
                                       :spscore => pep.sp, :sprank => pep.rsp)
                  search_scores[:deltacnstar] = deltacnstar if deltacnstar
                end
              end

              sr = Ms::Ident::Pepxml::SearchResult.new(:search_hits => search_hits)

              ret_time =
                if opt[:retention_times]
                  (first_scan, last_scan) = i_ar[0,2]
                  if first_scan==last_scan
                    scan_to_ret_time[first_scan]
                  else
                    # average the times of every scan in the range that has one;
                    # nil if none of them do
                    times = (first_scan..last_scan).map {|scan| scan_to_ret_time[scan] }.compact
                    times.inject(&:+) / times.size.to_f unless times.empty?
                  end
                end
              Ms::Ident::Pepxml::SpectrumQuery.new(
                :spectrum => [srf.base_name_noext, *i_ar].join('.'), :start_scan => i_ar[0], :end_scan => i_ar[1],
                :precursor_neutral_mass => precursor_neutral_mass, :assumed_charge => i_ar[2],
                :retention_time_sec => ret_time,
                :search_results => [sr]
              )
            end
            spectrum_queries.replace(spec_queries)
          end
        end
      end
      pepxml
    end # to_pepxml
  end # Srf::Pepxml
  include Pepxml
end # Srf
256
+
257
+
258
+ require 'trollop'
259
+
260
module Ms::Sequest::Srf::Pepxml
  # Command line driver: converts each given srf file into a pepxml file.
  #
  # argv::     the command line arguments; after option parsing the
  #            remaining entries are treated as srf file names.
  # progname:: the program name shown in the usage banner.
  #
  # Prints the usage text and exits if no srf files are left after parsing.
  # Each srf file is written to the matching entry of --outdirs, or to the
  # directory holding the srf file when no output directories were given.
  def self.commandline(argv, progname=$0)
    parser = Trollop::Parser.new do
      banner %Q{
        usage: #{progname} [OPTIONS] <file>.srf ...
        output: <file>.xml ...
      }.lines.map(&:lstrip).join

      text ""
      text "major options:"
      opt :db_dir, "The dir holding the DB if different than in Srf. (pepxml requires a valid database path)", :type => :string
      opt :mz_dir, "directory holding mz[X]ML files (defaults to the folder holding the srf file)", :type => :string
      opt :retention_times, "include retention times (requires mz-dir)"
      opt :deltacn_orig, "use original deltacn values created by SEQUEST. By default, the top hit gets the next hit's original deltacn."
      opt :no_filter, "do not filter hits by peptide_mass_tolerance (per sequest params)"
      opt :num_hits, "include N top hits", :default => 1
      opt :outdirs, "list of output directories", :type => :strings
      opt :quiet, "do not print warnings, etc."

      text ""
      text "minor options:"
      opt :ms_model, 'mass spectrometer model', :type => :string
      opt :ms_ionization, 'type of ms ionization', :default => 'ESI'
      opt :ms_detector, 'ms detector', :default => 'UNKNOWN'
      opt :ms_mass_analyzer, 'ms mass analyzer', :type => :string
      opt :ms_manufacturer, 'ms manufacturer', :default => 'Thermo'
      opt :raw_data, 'preferred extension for raw data', :default => '.mzXML'
      opt :db_seq_type, "'AA' or 'NA'", :default => 'AA'
      opt :db_residue_size, 'calculate the size of the fasta file'
      opt :db_name, 'the database name', :type => :string
      opt :db_orig_database_url, 'original database url', :type => :string
      opt :db_release_date, 'database release date', :type => :string
      opt :db_release_identifier, 'the database release identifier', :type => :string
    end

    # Parser#parse raises on --help / bad options; the standard handler turns
    # those into the usual help text or error message.
    opt = Trollop.with_standard_exception_handling(parser) { parser.parse(argv) }
    if argv.empty?
      parser.educate
      exit
    end

    # use the parser's own die: Trollop.die depends on a prior Trollop.options call
    if opt[:outdirs] && opt[:outdirs].size != argv.size
      parser.die :outdirs, "outdirs must be same size as number of input files"
    end

    # pull out the options that are not meant for to_pepxml *before* looping
    # so that every srf file is read with the same filter setting
    filter = !opt.delete(:no_filter)
    outdirs = opt.delete(:outdirs) || []
    opt[:raw_data] = [opt[:raw_data]] if opt[:raw_data]
    opt[:verbose] = !opt[:quiet]

    argv.zip(outdirs) do |srf_file,outdir|
      outdir ||= File.dirname(srf_file)
      srf = Ms::Sequest::Srf.new(srf_file, :link_protein_hits => false, :filter_by_precursor_mass_tolerance => filter)
      pepxml = srf.to_pepxml(opt)
      outfile = pepxml.to_xml(outdir)
      puts "wrote file: #{outfile}" if opt[:verbose]
    end
  end
end
313
+
314
+
315
+
316
+
@@ -0,0 +1,21 @@
1
module Ms ; end
module Ms::Ident ; end

class Ms::Ident::Pepxml
  class SearchHit
    # The SEQUEST search scores attached to a single search hit.
    Sequest = Struct.new(:xcorr, :deltacn, :deltacnstar, :spscore, :sprank) do

      # Takes ions in the form XX/YY and returns [XX.to_i, YY.to_i]
      def self.split_ions(ions)
        ions.split("/").map(&:to_i)
      end

      # Calls builder.search_score(:name => member, :value => value) once
      # for every member of the struct, in member order (members whose
      # value is nil are passed along too). Returns nil.
      def to_xml(builder)
        each_pair do |name, value|
          builder.search_score(:name => name, :value => value)
        end
        nil
      end
    end
  end
end
21
+
@@ -159,7 +159,7 @@ module Ms
159
159
  end
160
160
  # note that the rank is determined by the order..
161
161
  out.puts ['M', index+1, hit.rsp, hit_mh, hit_deltacn_orig_updated, hit_xcorr, hit_sp, hit.ions_matched, hit.ions_total, hit.sequence, manual_validation_status].join("\t")
162
- hit.prots.each do |prot|
162
+ hit.proteins.each do |prot|
163
163
  out.puts ['L', prot.first_entry].join("\t")
164
164
  end
165
165
  end
@@ -13,10 +13,10 @@ describe Bioworks, 'set from an xml file' do
13
13
  it 'can set one with labeled proteins' do
14
14
  file = Tfiles + "/bioworks_with_INV_small.xml"
15
15
  obj = Bioworks.new(file)
16
- obj.prots.size.should == 19
16
+ obj.proteins.size.should == 19
17
17
  file = Tfiles + '/bioworks_small.xml'
18
18
  obj = Bioworks.new(file)
19
- obj.prots.size.should == 106
19
+ obj.proteins.size.should == 106
20
20
  end
21
21
 
22
22
  it 'can parse an xml file NOT derived from multi-concensus' do
@@ -28,10 +28,10 @@ describe Bioworks, 'set from an xml file' do
28
28
  obj.global_filename.should == gfn
29
29
  obj.origfilename.should == origfilename
30
30
  obj.origfilepath.should == origfilepath
31
- obj.prots.size.should == 7
32
- obj.prots.first.peps.first.base_name.should == gfn
33
- obj.prots.first.peps.first.file.should == "152"
34
- obj.prots.first.peps.first.charge.should == 2
31
+ obj.proteins.size.should == 7
32
+ obj.proteins.first.peptides.first.base_name.should == gfn
33
+ obj.proteins.first.peptides.first.file.should == "152"
34
+ obj.proteins.first.peptides.first.charge.should == 2
35
35
  # @TODO: add more tests here
36
36
  end
37
37
 
@@ -57,7 +57,7 @@ describe Bioworks, 'set from an xml file' do
57
57
  def _assert_equal_pieces(exp, act, prot)
58
58
  # equal as floats (by delta)
59
59
  exp.each_index do |i|
60
- if i == 5 # both prots and peps
60
+ if i == 5 # both proteins and peptides
61
61
  act[i].to_f.should be_close(exp[i].to_f, 0.1)
62
62
  elsif i == 3 && !prot
63
63
  act[i].to_f.should be_close(exp[i].to_f, 0.01)
@@ -99,7 +99,7 @@ describe Bioworks, 'set from an xml file' do
99
99
  end
100
100
  exp_peps = exp_peps.zip(exp_prots)
101
101
  exp_peps.collect! do |both|
102
- both[0].prots = [both[1]]
102
+ both[0].proteins = [both[1]]
103
103
  both[0]
104
104
  end
105
105
 
@@ -107,8 +107,8 @@ describe Bioworks, 'set from an xml file' do
107
107
  pep = Bioworks::Pep.new
108
108
  pep.charge = arr[0]
109
109
  pep.sequence = arr[1]
110
- pep.prots = [Bioworks::Prot.new]
111
- pep.prots.first.reference = "#{cnt}"
110
+ pep.proteins = [Bioworks::Prot.new]
111
+ pep.proteins.first.reference = "#{cnt}"
112
112
  cnt += 1
113
113
  pep
114
114
  end
@@ -130,7 +130,7 @@ end
130
130
 
131
131
  describe Bioworks::Pep do
132
132
  it 'can be initialized from a hash' do
133
- hash = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :prots => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_prots => 17, :_first_prot => 18}
133
+ hash = {:sequence => 0, :mass => 1, :deltamass => 2, :charge => 3, :xcorr => 4, :deltacn => 5, :sp => 6, :rsp => 7, :ions => 8, :count => 9, :tic => 10, :proteins => 11, :base_name => 12, :first_scan => 13, :last_scan => 14, :peptide_probability => 15, :file => 16, :_num_proteins => 17, :_first_prot => 18}
134
134
  pep = Bioworks::Pep.new(hash)
135
135
  hash.each do |k,v|
136
136
  pep.send(k).should == v
@@ -0,0 +1,50 @@
1
+ require 'spec_helper'
2
+
3
+ require 'ms/sequest/params'
4
+ require 'ms/sequest/pepxml/modifications'
5
+
6
describe 'Ms::Sequest::Pepxml::Modifications' do
  before do
    # The params object is not otherwise needed for these tests; it is
    # created because it sets up the mass table.
    @params = Ms::Sequest::Params.new(TESTFILES + "/bioworks32.params")
    @obj = Ms::Sequest::Pepxml::Modifications.new(@params, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")
  end

  it 'creates a mod_symbols_hash' do
    expected = {
      [:C, 12.0] => "^",
      [:S, 80.0] => "@",
      [:M, 29.0] => "#",
      [:M, 15.9] => "*",
      [:ct, 12.33] => "[",
      [:nt, 14.2] => "]",
    }
    @obj.mod_symbols_hash.should == expected
    ## need more here
  end

  it 'creates a ModificationInfo object given a special peptide sequence' do
    @params.diff_search_options = "15.90000 M 29.00000 M 80.00000 S 12.00000 C"
    @params.term_diff_search_options = "14.20000 12.33000"
    mods = Ms::Sequest::Pepxml::Modifications.new(@params, "(M* +15.90000) (M# +29.00000) (S@ +80.00000) (C^ +12.00000) (ct[ +12.33000) (nt] +14.20000) ")

    # a peptide without modification symbols yields no modification info
    ok mods.modification_info("PEPTIDE").nil?

    modinfo = mods.modification_info("]M*EC^S@IDM#M*EMSCM[")

    xml_string = modinfo.to_xml
    [/<mod_aminoacid_mass /, /mod_nterm_mass=/, /mod_cterm_mass=/, /modified_peptide=/].each do |pattern|
      xml_string.matches pattern
    end

    modinfo.mod_aminoacid_masses.size.is 5
    aa_masses = modinfo.mod_aminoacid_masses
    # positions are verified, masses are just frozen
    expected_positions = [1,3,4,7,8]
    expected_masses = [147.09606, 115.1429, 167.0772999, 160.19606, 147.09606]
    expected_positions.each_with_index do |position, idx|
      aa_mass = aa_masses[idx]
      aa_mass.position.is position
      aa_mass.mass.should.be.close expected_masses[idx], 0.0001
    end
    # These values are just frozen and not independently verified yet
    modinfo.mod_nterm_mass.should.be.close 146.4033, 0.0001
    modinfo.mod_cterm_mass.should.be.close 160.5334, 0.0001
  end

end
50
+