mspire-sequest 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/.autotest +30 -0
  2. data/.gitmodules +9 -0
  3. data/History +79 -0
  4. data/LICENSE +22 -0
  5. data/README.rdoc +85 -0
  6. data/Rakefile +52 -0
  7. data/VERSION +1 -0
  8. data/bin/srf_to_pepxml.rb +7 -0
  9. data/bin/srf_to_search.rb +7 -0
  10. data/bin/srf_to_sqt.rb +8 -0
  11. data/lib/mspire/sequest/params.rb +331 -0
  12. data/lib/mspire/sequest/pepxml/modifications.rb +247 -0
  13. data/lib/mspire/sequest/pepxml/params.rb +32 -0
  14. data/lib/mspire/sequest/sqt.rb +393 -0
  15. data/lib/mspire/sequest/srf/pepxml/sequest.rb +21 -0
  16. data/lib/mspire/sequest/srf/pepxml.rb +333 -0
  17. data/lib/mspire/sequest/srf/search.rb +158 -0
  18. data/lib/mspire/sequest/srf/sqt.rb +218 -0
  19. data/lib/mspire/sequest/srf.rb +715 -0
  20. data/lib/mspire/sequest.rb +6 -0
  21. data/script/fasta_ipi_to_ncbi-ish.rb +29 -0
  22. data/spec/mspire/sequest/params_spec.rb +135 -0
  23. data/spec/mspire/sequest/pepxml/modifications_spec.rb +50 -0
  24. data/spec/mspire/sequest/pepxml_spec.rb +311 -0
  25. data/spec/mspire/sequest/sqt_spec.rb +51 -0
  26. data/spec/mspire/sequest/sqt_spec_helper.rb +34 -0
  27. data/spec/mspire/sequest/srf/pepxml_spec.rb +89 -0
  28. data/spec/mspire/sequest/srf/search_spec.rb +131 -0
  29. data/spec/mspire/sequest/srf/sqt_spec.rb +228 -0
  30. data/spec/mspire/sequest/srf_spec.rb +113 -0
  31. data/spec/mspire/sequest/srf_spec_helper.rb +172 -0
  32. data/spec/spec_helper.rb +22 -0
  33. data/spec/testfiles/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  34. data/spec/testfiles/bioworks31.params +77 -0
  35. data/spec/testfiles/bioworks32.params +62 -0
  36. data/spec/testfiles/bioworks33.params +63 -0
  37. data/spec/testfiles/corrupted_900.srf +0 -0
  38. data/spec/testfiles/small.sqt +87 -0
  39. data/spec/testfiles/small2.sqt +176 -0
  40. metadata +185 -0
@@ -0,0 +1,218 @@
1
+ require 'mspire/sequest'
2
+ require 'mspire/sequest/srf'
3
+ require 'mspire/sequest/sqt'
4
+
5
+
6
module Mspire
  module Sequest
    class Srf

      # Mixin that lets an Srf object write itself out as an SQT text file:
      # 'H' header lines followed by 'S' (spectrum), 'M' (match) and
      # 'L' (locus) lines.  The including object must respond to +params+,
      # +header+, +out_files+, +dta_files+ and +base_name+.
      module Sqt

        # decimal places used for each kind of value when :round is requested
        SQT_DECIMAL_PLACES = {:tic => 2, :mh => 7, :xcorr => 5, :sp => 2, :dcn => 5}.freeze

        # header labels always written first, in this order (just for
        # readability and consistency)
        SQT_INVARIANT_ORDERING = %w(SQTGenerator SQTGeneratorVersion Database FragmentMasses PrecursorMasses StartTime).freeze

        # Writes the search results to an SQT file.
        #
        # out_filename:: path of the file to write; when nil (or false) the
        #                file is named base_name + '.sqt'
        # opts::
        #   :db_info        => add DBSeqLength/DBLocusCount/DBMD5Sum headers
        #                      (only if the database file exists; warns otherwise)
        #   :new_db_path    => directory in which to look for the database
        #   :update_db_path => write the expanded :new_db_path location into
        #                      the Database header (only used with :new_db_path)
        #   :round          => round floating point numbers (see
        #                      SQT_DECIMAL_PLACES)
        #
        # Returns nil.
        def to_sqt(out_filename=nil, opts={})
          defaults = {:db_info=>false, :new_db_path=>nil, :update_db_path=>false, :round=>false}
          opt = defaults.merge(opts)
          outfile = out_filename || (base_name + '.sqt')

          plural = sqt_plural_headers
          db_filename, db_filename_in_sqt = sqt_database_paths(opt[:new_db_path], opt[:update_db_path])
          single = sqt_single_headers(db_filename_in_sqt)
          sqt_add_db_info(single, db_filename) if opt[:db_info]

          # somewhat redundant with :db_info, but we can get this without a
          # db present!
          single['DBLocusCount'] = out_files.first.db_locus_count if out_files.size > 0

          File.open(outfile, 'w') do |out|
            sqt_write_header(out, single, plural)
            sqt_write_spectra(out, opt[:round])
          end # close the filehandle
        end

        private

        # 'AVG' for the 'average' mass type, 'MONO' for anything else.
        def sqt_mass_type_label(mass_type)
          mass_type == 'average' ? 'AVG' : 'MONO'
        end

        # Returns value rounded to the precision registered for kind when
        # do_round is truthy, otherwise value untouched.
        def sqt_round(value, kind, do_round)
          do_round ? value.round(SQT_DECIMAL_PLACES[kind]) : value
        end

        # Builds the 'key=value' strings for the StaticMod header lines.  A
        # single-letter key is reported as the mass_index entry plus the
        # modification; longer keys are reported with the value as given.
        def sqt_static_mods
          mass_index = params.mass_index
          params.static_mods.map do |name, mass_diff|
            key = name.split(/_/)[1]
            if key.size == 1
              key + '=' + (mass_index[key] + mass_diff.to_f).to_s
            else
              key + '=' + mass_diff
            end
          end
        end

        # Builds the DynamicMod strings: every parenthesized group in
        # header.modifications, with its first space replaced by '='.
        def sqt_dynamic_mods
          mods = []
          header.modifications.scan(/\((.*?)\)/) do |match|
            mods << match.first.sub(/ /,'=')
          end
          mods
        end

        # Headers that may be written several times (one line per value).
        def sqt_plural_headers
          {
            'StaticMod' => sqt_static_mods,
            'DynamicMod' => sqt_dynamic_mods, # example as diff mod
            'Comment' => ['Created from Bioworks .srf file']
          }
        end

        # Returns [db_filename, db_filename_in_sqt]: the path used to look the
        # database up on this file system and the path written into the
        # Database header.  Warns when asked to write a relocated path that
        # does not exist.
        def sqt_database_paths(new_db_path, update_db_path)
          db_filename = header.db_filename.sub(/\.hdr$/, '') # remove the .hdr postfix
          db_filename_in_sqt = db_filename
          if new_db_path
            # normalize backslash separators so File.basename can find the file name
            db_filename = File.join(new_db_path, File.basename(db_filename.gsub('\\', '/')))
            if update_db_path
              db_filename_in_sqt = File.expand_path(db_filename)
              warn "writing Database #{db_filename} to sqt, but it does not exist on this file system" unless File.exist?(db_filename)
            end
          end
          [db_filename, db_filename_in_sqt]
        end

        # Maps params.peptide_mass_units to its label (nil if unrecognized).
        def sqt_precursor_mass_units
          case params.peptide_mass_units
          when '0' ; 'amu'
          when '1' ; 'mmu'
          when '2' ; 'ppm'
          end
        end

        # Headers that are written exactly once, in insertion order.
        def sqt_single_headers(db_filename_in_sqt)
          {
            'SQTGenerator' => "mspire: ms-sequest",
            'SQTGeneratorVersion' => Mspire::Sequest::VERSION,
            'Database' => db_filename_in_sqt,
            'FragmentMasses' => sqt_mass_type_label(params.fragment_mass_type),
            'PrecursorMasses' => sqt_mass_type_label(params.precursor_mass_type),
            'StartTime' => '', # Bioworks 3.2 also leaves this blank...
            'Alg-PreMassTol' => params.peptide_mass_tolerance,
            'Alg-FragMassTol' => params.fragment_ion_tolerance,
            'Alg-PreMassUnits' => sqt_precursor_mass_units, ## mine
            'Alg-IonSeries' => header.ion_series.split(':').last.lstrip,
            'Alg-Enzyme' => header.enzyme.split(':').last,
            'Alg-MSModel' => header.model,
          }
        end

        # Adds DBSeqLength, DBLocusCount and DBMD5Sum to headers when the
        # database file exists; otherwise only warns.
        def sqt_add_db_info(headers, db_filename)
          if File.exist?(db_filename)
            reply = Mspire::Sequest::Sqt.db_info(db_filename)
            %w(DBSeqLength DBLocusCount DBMD5Sum).zip(reply) do |label,val|
              headers[label] = val
            end
          else
            warn "file #{db_filename} does not exist, no extra db info in header!"
          end
        end

        # Writes all 'H' lines: the invariant headers first, then the
        # remaining single-valued headers, then the multi-valued ones.
        # NOTE: removes the invariant keys from +single+ as they are written.
        def sqt_write_header(out, single, plural)
          SQT_INVARIANT_ORDERING.each do |label|
            out.puts ['H', label, single.delete(label)].join("\t")
          end
          single.each do |label, val|
            out.puts ['H', label, val].join("\t")
          end
          plural.each do |label, vals|
            vals.each do |val|
              out.puts ['H', label, val].join("\t")
            end
          end
        end

        # Writes one 'S' line per out_file (paired with its dta_file by
        # position), each followed by the 'M' and 'L' lines of its hits.
        #
        # NEED TO FIGURE OUT: (in spectra guy)
        #   * Lowest Sp value for top 500 spectra
        #   * Number of sequences matching this precursor ion
        def sqt_write_spectra(out, do_round)
          # don't have the time to process (using 0.0 like bioworks 3.2)
          time_to_process = '0.0'
          out_files.zip(dta_files) do |out_file, dta_file|
            out.puts [
              'S', out_file.first_scan, out_file.last_scan, out_file.charge,
              time_to_process, out_file.computer,
              sqt_round(dta_file.mh, :mh, do_round),
              sqt_round(out_file.total_inten, :tic, do_round),
              sqt_round(out_file.lowest_sp, :sp, do_round),
              out_file.num_matched_peptides
            ].join("\t")
            sqt_write_hits(out, out_file.hits, do_round)
          end
        end

        # Writes the 'M' line for every hit followed by an 'L' line for each
        # of the hit's proteins.
        def sqt_write_hits(out, hits, do_round)
          manual_validation_status = 'U'
          hits.each_with_index do |hit, index|
            # note that the rank is determined by the order..
            out.puts [
              'M', index+1, hit.rsp,
              sqt_round(hit.mh, :mh, do_round),
              sqt_round(hit.deltacn_orig_updated, :dcn, do_round),
              sqt_round(hit.xcorr, :xcorr, do_round),
              sqt_round(hit.sp, :sp, do_round),
              hit.ions_matched, hit.ions_total, hit.sequence,
              manual_validation_status
            ].join("\t")
            hit.proteins.each do |prot|
              out.puts ['L', prot.first_entry].join("\t")
            end
          end
        end
      end # Sqt
      include Sqt
    end # Srf
  end # Sequest
end # Mspire
172
+
173
+
174
+ require 'optparse'
175
+
176
module Mspire
  module Sequest
    class Srf
      module Sqt
        # Command line driver: converts each given .srf file into an .sqt file.
        #
        # argv::     command line arguments (switches are removed in place by
        #            the option parser, leaving only the input file names)
        # progname:: program name shown in the usage banner
        #
        # With no input files the usage message is printed and the process
        # exits.  Raises RuntimeError when --outfiles is given with a
        # different number of names than there are input files.
        # Returns what remains of argv (the input file names).
        def self.commandline(argv, progname=$0)
          opt = {
            :filter => true
          }
          parser = OptionParser.new do |op|
            op.banner = "usage: #{progname} [OPTIONS] <file>.srf ..."
            op.separator "output: <file>.sqt ..."
            op.separator ""
            op.separator "options:"
            op.on("-d", "--db-info", "calculates num aa's and md5sum on db") {|v| opt[:db_info] = v }
            op.on("-p", "--db-path <String>", "If you need to specify the database path") {|v| opt[:new_db_path] = v }
            op.on("-u", "--db-update", "update the sqt file to reflect --db-path") {|v| opt[:db_update] = v }
            op.on("-n", "--no-filter", "by default, pephit must be within peptide_mass_tolerance", "(defined in sequest.params) to be included.  Turns this off.") { opt[:filter] = false }
            op.on("-o", "--outfiles <first,...>", Array, "Comma list of output filenames") {|v| opt[:outfiles] = v }
            op.on("-r", "--round", "round floating point values reasonably") {|v| opt[:round] = v }
          end
          parser.parse!(argv)

          if argv.empty?
            puts parser
            exit
          end

          if opt[:outfiles] && (opt[:outfiles].size != argv.size)
            raise "if outfiles specified, outfiles must be same size as number of input files"
          end

          argv.each_with_index do |srf_file,i|
            outfile =
              if opt[:outfiles]
                opt[:outfiles][i]
              else
                # swap the input's extension for .sqt
                srf_file.chomp(File.extname(srf_file)) + '.sqt'
              end

            # read (not delete) the filter flag so that every input file, not
            # just the first, is filtered the same way
            srf = Mspire::Sequest::Srf.new(srf_file, :link_protein_hits => false, :filter_by_precursor_mass_tolerance => opt[:filter])
            srf.to_sqt(outfile, :db_info => opt[:db_info], :new_db_path => opt[:new_db_path], :update_db_path => opt[:db_update], :round => opt[:round])
          end
        end
      end
    end
  end
end
217
+
218
+