mspire-sequest 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/.autotest +30 -0
  2. data/.gitmodules +9 -0
  3. data/History +79 -0
  4. data/LICENSE +22 -0
  5. data/README.rdoc +85 -0
  6. data/Rakefile +52 -0
  7. data/VERSION +1 -0
  8. data/bin/srf_to_pepxml.rb +7 -0
  9. data/bin/srf_to_search.rb +7 -0
  10. data/bin/srf_to_sqt.rb +8 -0
  11. data/lib/mspire/sequest/params.rb +331 -0
  12. data/lib/mspire/sequest/pepxml/modifications.rb +247 -0
  13. data/lib/mspire/sequest/pepxml/params.rb +32 -0
  14. data/lib/mspire/sequest/sqt.rb +393 -0
  15. data/lib/mspire/sequest/srf/pepxml/sequest.rb +21 -0
  16. data/lib/mspire/sequest/srf/pepxml.rb +333 -0
  17. data/lib/mspire/sequest/srf/search.rb +158 -0
  18. data/lib/mspire/sequest/srf/sqt.rb +218 -0
  19. data/lib/mspire/sequest/srf.rb +715 -0
  20. data/lib/mspire/sequest.rb +6 -0
  21. data/script/fasta_ipi_to_ncbi-ish.rb +29 -0
  22. data/spec/mspire/sequest/params_spec.rb +135 -0
  23. data/spec/mspire/sequest/pepxml/modifications_spec.rb +50 -0
  24. data/spec/mspire/sequest/pepxml_spec.rb +311 -0
  25. data/spec/mspire/sequest/sqt_spec.rb +51 -0
  26. data/spec/mspire/sequest/sqt_spec_helper.rb +34 -0
  27. data/spec/mspire/sequest/srf/pepxml_spec.rb +89 -0
  28. data/spec/mspire/sequest/srf/search_spec.rb +131 -0
  29. data/spec/mspire/sequest/srf/sqt_spec.rb +228 -0
  30. data/spec/mspire/sequest/srf_spec.rb +113 -0
  31. data/spec/mspire/sequest/srf_spec_helper.rb +172 -0
  32. data/spec/spec_helper.rb +22 -0
  33. data/spec/testfiles/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  34. data/spec/testfiles/bioworks31.params +77 -0
  35. data/spec/testfiles/bioworks32.params +62 -0
  36. data/spec/testfiles/bioworks33.params +63 -0
  37. data/spec/testfiles/corrupted_900.srf +0 -0
  38. data/spec/testfiles/small.sqt +87 -0
  39. data/spec/testfiles/small2.sqt +176 -0
  40. metadata +185 -0
@@ -0,0 +1,218 @@
1
+ require 'mspire/sequest'
2
+ require 'mspire/sequest/srf'
3
+ require 'mspire/sequest/sqt'
4
+
5
+
6
module Mspire
  module Sequest
    class Srf

      # Mixin giving an Srf object the ability to write itself out as an SQT
      # file.  It relies on the including object responding to +params+,
      # +header+, +out_files+, +dta_files+ and +base_name+.
      module Sqt

        # Decimal places used for each kind of value when :round is requested.
        SQT_DECIMAL_PLACES = {:tic => 2, :mh => 7, :xcorr => 5, :sp => 2, :dcn => 5}.freeze

        # Header keys that are always written first, in this order (just for
        # readability and consistency).
        SQT_INVARIANT_HEADERS = %w(SQTGenerator SQTGeneratorVersion Database FragmentMasses PrecursorMasses StartTime).freeze

        # Maps the params peptide_mass_units code onto the unit name written
        # in the 'Alg-PreMassUnits' header.  Unknown codes give nil (an empty
        # header value).
        SQT_MASS_UNITS = {'0' => 'amu', '1' => 'mmu', '2' => 'ppm'}.freeze

        # Writes the search results as an SQT file.
        #
        # out_filename:: path of the file to write; when nil/false the file
        #                is named base_name + '.sqt'
        # opts:: a hash of options (defaults in parentheses):
        #   :db_info::        (false) add DBSeqLength, DBLocusCount and
        #                     DBMD5Sum headers computed from the database
        #                     file (only when that file exists)
        #   :new_db_path::    (nil) directory in which to look for the
        #                     database file instead of the path stored in
        #                     the srf header
        #   :update_db_path:: (false) with :new_db_path, write the expanded
        #                     new path in the 'Database' header
        #   :round::          (false) round floating point numbers to the
        #                     precisions in SQT_DECIMAL_PLACES
        #
        # Everything is computed before the file is opened, so a failure while
        # gathering header information does not leave a truncated file behind.
        # Returns nil.
        def to_sqt(out_filename=nil, opts={})
          opt = {:db_info=>false, :new_db_path=>nil, :update_db_path=>false, :round=>false}.merge(opts)
          outfile = out_filename || (base_name + '.sqt')

          repeated = sqt_plural_headers
          db_filename, db_filename_in_sqt = sqt_database_paths(opt)
          single = sqt_header_fields(db_filename, db_filename_in_sqt, opt[:db_info])

          File.open(outfile, 'w') do |out|
            # print the header: fixed-order keys first, then whatever is left
            SQT_INVARIANT_HEADERS.each do |key|
              out.puts(['H', key, single.delete(key)].join("\t"))
            end
            single.each do |key, val|
              out.puts(['H', key, val].join("\t"))
            end
            repeated.each do |key, vals|
              vals.each {|val| out.puts(['H', key, val].join("\t")) }
            end

            sqt_write_spectra(out, opt[:round])
          end # close the filehandle
          nil
        end # method

        private

        # Builds the 'StaticMod' values from params.static_mods.  For a single
        # letter key the value written is the mass_index mass plus the
        # modification; for any other key the modification is written as given.
        def sqt_static_mods
          mass_index = params.mass_index
          params.static_mods.map do |name, delta|
            key = name.split(/_/)[1]
            if key.size == 1
              key + '=' + (mass_index[key] + delta.to_f).to_s
            else
              key + '=' + delta
            end
          end
        end

        # Builds the 'DynamicMod' values: each parenthesized group in
        # header.modifications with its first space replaced by '='.
        def sqt_dynamic_mods
          header.modifications.scan(/\((.*?)\)/).map do |captures|
            captures.first.sub(/ /,'=')
          end
        end

        # Headers that may be written several times, one line per value.
        def sqt_plural_headers
          {
            'StaticMod' => sqt_static_mods,
            'DynamicMod' => sqt_dynamic_mods, # example as diff mod
            'Comment' => ['Created from Bioworks .srf file']
          }
        end

        # Returns [db_filename, db_filename_in_sqt]: the path used to look the
        # database up on this file system and the path written in the header.
        def sqt_database_paths(opt)
          db_filename = header.db_filename.sub(/\.hdr$/, '') # remove the .hdr postfix
          db_filename_in_sqt = db_filename
          if opt[:new_db_path]
            # the stored path may use backslash separators
            db_filename = File.join(opt[:new_db_path], File.basename(db_filename.gsub('\\', '/')))
            if opt[:update_db_path]
              db_filename_in_sqt = File.expand_path(db_filename)
              warn "writing Database #{db_filename} to sqt, but it does not exist on this file system" unless File.exist?(db_filename)
            end
          end
          [db_filename, db_filename_in_sqt]
        end

        # Builds the hash of single-valued headers, in the order they are to
        # be written (after the invariant ones have been pulled out).
        def sqt_header_fields(db_filename, db_filename_in_sqt, want_db_info)
          fields = {
            'SQTGenerator' => "mspire: ms-sequest",
            'SQTGeneratorVersion' => Mspire::Sequest::VERSION,
            'Database' => db_filename_in_sqt,
            'FragmentMasses' => (params.fragment_mass_type == 'average') ? 'AVG' : 'MONO',
            'PrecursorMasses' => (params.precursor_mass_type == 'average') ? 'AVG' : 'MONO',
            'StartTime' => '', # Bioworks 3.2 also leaves this blank...
            'Alg-PreMassTol' => params.peptide_mass_tolerance,
            'Alg-FragMassTol' => params.fragment_ion_tolerance,
            'Alg-PreMassUnits' => SQT_MASS_UNITS[params.peptide_mass_units], ## mine
            'Alg-IonSeries' => header.ion_series.split(':').last.lstrip,
            'Alg-Enzyme' => header.enzyme.split(':').last,
            'Alg-MSModel' => header.model,
          }

          if want_db_info
            if File.exist?(db_filename)
              reply = Mspire::Sequest::Sqt.db_info(db_filename)
              %w(DBSeqLength DBLocusCount DBMD5Sum).zip(reply) do |label, val|
                fields[label] = val
              end
            else
              warn "file #{db_filename} does not exist, no extra db info in header!"
            end
          end

          # somewhat redundant with above, but we can get this without a db present!
          fields['DBLocusCount'] = out_files.first.db_locus_count if out_files.size > 0
          fields
        end

        # Writes one 'S' line per out file (paired by position with the dta
        # files), followed by an 'M' line per hit and an 'L' line per protein.
        #
        # NEED TO FIGURE OUT: (in spectra guy)
        #   * Lowest Sp value for top 500 spectra
        #   * Number of sequences matching this precursor ion
        def sqt_write_spectra(out, round)
          dp = SQT_DECIMAL_PLACES
          # don't have the time to process (using 0.0 like bioworks 3.2)
          time_to_process = '0.0'
          manual_validation_status = 'U'

          out_files.zip(dta_files) do |out_file, dta_file|
            precursor_mh = dta_file.mh
            total_inten = out_file.total_inten
            lowest_sp = out_file.lowest_sp
            if round
              precursor_mh = precursor_mh.round(dp[:mh])
              total_inten = total_inten.round(dp[:tic])
              lowest_sp = lowest_sp.round(dp[:sp])
            end

            out.puts(['S', out_file.first_scan, out_file.last_scan, out_file.charge, time_to_process, out_file.computer, precursor_mh, total_inten, lowest_sp, out_file.num_matched_peptides].join("\t"))
            out_file.hits.each_with_index do |hit, index|
              hit_mh = hit.mh
              hit_deltacn = hit.deltacn_orig_updated
              hit_xcorr = hit.xcorr
              hit_sp = hit.sp
              if round
                hit_mh = hit_mh.round(dp[:mh])
                hit_deltacn = hit_deltacn.round(dp[:dcn])
                hit_xcorr = hit_xcorr.round(dp[:xcorr])
                hit_sp = hit_sp.round(dp[:sp])
              end
              # note that the rank is determined by the order..
              out.puts(['M', index+1, hit.rsp, hit_mh, hit_deltacn, hit_xcorr, hit_sp, hit.ions_matched, hit.ions_total, hit.sequence, manual_validation_status].join("\t"))
              hit.proteins.each do |prot|
                out.puts(['L', prot.first_entry].join("\t"))
              end
            end
          end
        end
      end # Sqt
      include Sqt
    end # Srf
  end # Sequest
end # Mspire
172
+
173
+
174
+ require 'optparse'
175
+
176
module Mspire
  module Sequest
    class Srf
      module Sqt
        # Command line driver: converts each given .srf file into an .sqt file.
        #
        # argv::     array of command line arguments; option switches are
        #            removed from it in place (OptionParser#parse!), leaving
        #            only the input file names
        # progname:: program name shown in the usage banner
        #
        # When no input file is left after option parsing, the usage message
        # is printed and the process exits.  Raises a RuntimeError when
        # --outfiles is given with a different number of names than there are
        # input files.  Returns argv (the remaining input file names).
        def self.commandline(argv, progname=$0)
          opt = {
            :filter => true
          }
          parser = OptionParser.new do |op|
            op.banner = "usage: #{progname} [OPTIONS] <file>.srf ..."
            op.separator "output: <file>.sqt ..."
            op.separator ""
            op.separator "options:"
            op.on("-d", "--db-info", "calculates num aa's and md5sum on db") {|v| opt[:db_info] = v }
            op.on("-p", "--db-path <String>", "If you need to specify the database path") {|v| opt[:new_db_path] = v }
            op.on("-u", "--db-update", "update the sqt file to reflect --db-path") {|v| opt[:db_update] = v }
            op.on("-n", "--no-filter", "by default, pephit must be within peptide_mass_tolerance", "(defined in sequest.params) to be included.  Turns this off.") { opt[:filter] = false }
            op.on("-o", "--outfiles <first,...>", Array, "Comma list of output filenames") {|v| opt[:outfiles] = v }
            op.on("-r", "--round", "round floating point values reasonably") {|v| opt[:round] = v }
          end
          parser.parse!(argv)

          if argv.size == 0
            puts parser
            exit
          end

          if opt[:outfiles] && (opt[:outfiles].size != argv.size)
            raise "if outfiles specified, outfiles must be same size as number of input files"
          end

          argv.each_with_index do |srf_file, i|
            outfile =
              if opt[:outfiles]
                opt[:outfiles][i]
              else
                srf_file.chomp(File.extname(srf_file)) + '.sqt'
              end

            # read (never delete) the filter setting so that it applies to
            # every input file, not just the first one
            srf = Mspire::Sequest::Srf.new(srf_file, :link_protein_hits => false, :filter_by_precursor_mass_tolerance => opt[:filter])
            srf.to_sqt(outfile, :db_info => opt[:db_info], :new_db_path => opt[:new_db_path], :update_db_path => opt[:db_update], :round => opt[:round])
          end
        end
      end
    end
  end
end
217
+
218
+