ms-sequest 0.0.11 → 0.0.12

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
+ # -*- ruby -*-
2
+
3
# Clear the existing autotest mappings so that only the mapping registered
# below applies.
Autotest.add_hook(:initialize) { |autotest| autotest.clear_mappings }

# Map every lib/<name>.rb file onto its spec and/or test file.
Autotest.add_hook(:initialize) do |autotest|
  autotest.add_mapping(%r%^lib/(.*)\.rb$%) do |_, match|
    # specs only: ["spec/#{match[1]}_spec.rb"]
    # tests only: ["test/#{match[1]}_test.rb"]
    ## for both specs and tests:
    stem = match[1]
    ["spec/#{stem}_spec.rb", "test/#{stem}_test.rb"]
  end
end
@@ -0,0 +1,8 @@
1
+ .DS_Store
2
+ pkg/
3
+ rdoc/
4
+ backup/
5
+ config/
6
+ data/
7
+ *.swp
8
+ *.gemspec
@@ -0,0 +1,9 @@
1
+ [submodule "submodule/ms-testdata"]
2
+ path = submodule/ms-testdata
3
+ url = git://github.com/bahuvrihi/ms-testdata.git
4
+ [submodule "submodule/ms-in_silico"]
5
+ path = submodule/ms-in_silico
6
+ url = git://github.com/bahuvrihi/ms-in_silico.git
7
+ [submodule "submodule/tap-mechanize"]
8
+ path = submodule/tap-mechanize
9
+ url = git://github.com/bahuvrihi/tap-mechanize.git
data/History CHANGED
@@ -1,3 +1,11 @@
1
+ == 0.0.12 / 2010-01-01
2
+
3
+ * moved over to jeweler and tests to bacon (spec/more)
4
+
5
+ == 0.0.11 / 2010-01-01
6
+
7
+ * peptides have sf value (read from srf file)
8
+
1
9
  == 0.0.10 / 2009-12-03
2
10
 
3
11
  * turned off warning if print_duplicates == 0
@@ -1,6 +1,7 @@
1
1
  Copyright shared among contributing institutions:
2
2
  Copyright (c) 2006-2008 University of Texas at Austin (the initial project)
3
3
  Copyright (c) 2009 Regents of the University of Colorado and Howard Hughes Medical Institute. (modularization of the project)
4
+ Author: John T. Prince under direction of Edward Marcotte and Natalie Ahn
4
5
 
5
6
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
7
  of this software and associated documentation files (the "Software"), to deal
@@ -0,0 +1,77 @@
1
+ = {ms-sequest}[http://jtprince.github.com/ms-sequest/rdoc/]
2
+
3
+ An {mspire}[http://mspire.rubyforge.org] library supporting SEQUEST, Bioworks, SQT and associated formats.
4
+
5
+ == Examples
6
+
7
+ === Ms::Sequest::Srf
8
+
9
+ Can read and convert Bioworks Sequest Results Files (SRF).
10
+
11
+ require 'ms/sequest/srf'
12
+ srf = Ms::Sequest::Srf.new("file.srf")
13
+
14
+ Conversions (see api for options):
15
+
16
+ srf.to_sqt # (outputs a file) -> file.sqt
17
+
18
+ srf.to_mgf # (outputs a file) -> file.mgf
19
+ srf.to_dta # (outputs a dir) -> file
20
+ srf.to_dta("file.tgz", :tgz) # on the fly tgz (requires archive-tar-minitar)
21
+
22
+ Object access (see Ms::Sequest::Srf for much more):
23
+
24
+ srf.header # Ms::Sequest::Srf::Header object
25
+ srf.params # Ms::Sequest::Params object
26
+ srf.dta_files # Ms::Sequest::Srf::DTA objects
27
+ srf.peps # Ms::Sequest::Srf::Out::Pep objects
28
+ srf.prots # Ms::Sequest::Srf::Out::Prot objects
29
+
30
+ === Ms::Sequest::Params
31
+
32
+ Object or hash access to any parameter in the file. Also provides a unified interface across several versions (3.1 - 3.3)
33
+
34
+ require 'ms/sequest/params'
35
+ params = Ms::Sequest::Params.new("sequest.params")
36
+ params.any_existing_param # -> some value or empty string if no value
37
+ params['any_existing_param'] # -> some value or empty string if no value
38
+ params.non_existent_param # -> nil
39
+
40
+ # some unified interface methods:
41
+ params.enzyme # -> enzyme name with no parentheses
42
+ params.database # -> first_database_name
43
+ params.enzyme_specificity # -> [offset, cleave_at, expect_if_after]
44
+ params.precursor_mass_type # => "average" | "monoisotopic"
45
+ params.fragment_mass_type # => "average" | "monoisotopic"
46
+
47
+ === Ms::Sequest::Sqt
48
+
49
+ sqt = Ms::Sequest::Sqt.new("file.sqt")
50
+ sqt.header
51
+ sqt.spectra.each do |spectrum| # an Ms::Sequest::Sqt::Spectrum object
52
+ spectrum.matches.each do |match| # an Ms::Sequest::Sqt::Match object
53
+ match.loci.each do |locus| # an Ms::Sequest::Sqt::Locus object
54
+ end
55
+ end
56
+ end
57
+
58
+ # or more direct access to Match and Locus objects:
59
+ sqt.peps
60
+ sqt.prots
61
+
62
+ Also reads Percolator SQT output files intelligently:
63
+
64
+ psqt = Ms::Sequest::Sqt.new("percolator_output.sqt")
65
+ psqt.peps.each do |pmatch|
66
+ pmatch.percolator_score == pmatch.xcorr
67
+ pmatch.negative_q_value == pmatch.sp
68
+ pmatch.q_value == -pmatch.negative_q_value
69
+ end
70
+
71
+ == Installation
72
+
73
+ gem install ms-sequest
74
+
75
+ == Copyright
76
+
77
+ See LICENSE (MIT)
@@ -0,0 +1,110 @@
1
+
2
+ require 'rubygems'
3
+ require 'rake'
4
+ require 'jeweler'
5
+ require 'rake/testtask'
6
+ require 'rcov/rcovtask'
7
+
8
NAME = "ms-sequest"
WEBSITE_BASE = "website"
WEBSITE_OUTPUT = "#{WEBSITE_BASE}/output"

# Gem metadata; passed to Jeweler below, which defines the packaging tasks.
gemspec = Gem::Specification.new do |spec|
  spec.name = NAME
  spec.authors = ["John T. Prince"]
  spec.email = "jtprince@gmail.com"
  spec.homepage = "http://jtprince.github.com/#{NAME}"
  spec.summary = "An mspire library supporting SEQUEST, Bioworks, SQT, etc"
  spec.description = "reads .SRF, .SQT and supports conversions"
  spec.rubyforge_project = 'mspire'

  # runtime dependencies, as [gem name, version requirement]
  [
    ["arrayclass", ">= 0.1.0"],
    ["ms-core", ">= 0.0.2"],
    ["tap", ">= 0.17.1"],
    ["ms-fasta", ">= 0.2.3"]
  ].each do |gem_name, requirement|
    spec.add_dependency(gem_name, requirement)
  end

  spec.add_development_dependency("ms-testdata", ">= 0.18.0")
  spec.add_development_dependency("spec/more")
end

Jeweler::Tasks.new(gemspec)
31
+
32
# `rake spec`: runs every *_spec.rb file found under spec/.
Rake::TestTask.new(:spec) do |task|
  task.libs.push('lib', 'spec')
  task.pattern = 'spec/**/*_spec.rb'
  task.verbose = true
  # Unless the `gems` environment variable is set, put the ms-testdata
  # submodule's lib directory on the load path.
  task.libs << 'submodule/ms-testdata/lib' unless ENV['gems']
  #task.libs << 'submodule/ms-in_silico/lib' unless ENV['gems']
  #task.libs << 'submodule/tap-mechanize/lib' unless ENV['gems']
end

# rcov coverage task over the same spec files.
Rcov::RcovTask.new do |task|
  task.libs << 'spec'
  task.pattern = 'spec/**/*_spec.rb'
  task.verbose = true
end
48
+
49
+
50
# Writes "<base_rdoc_output_dir>/index.html": a stub page whose meta refresh
# redirects the browser to "<package_website_page>/rdoc/<version>/".
#
# base_rdoc_output_dir:: directory that receives index.html (created if
#                        it does not exist)
# package_website_page:: base URL of the package website
# version::              version string; surrounding whitespace (such as the
#                        trailing newline of a VERSION file) is stripped so
#                        it cannot end up inside the redirect URL
#
# Reads the NAME constant for the page title.
def rdoc_redirect(base_rdoc_output_dir, package_website_page, version)
  version = version.to_s.strip
  # The title used to be built by concatenation without a separating space,
  # which produced e.g. "ms-sequestrdoc".
  content = %Q{
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html><head><title>mspire: #{NAME} rdoc</title>
<meta http-equiv="REFRESH" content="0;url=#{package_website_page}/rdoc/#{version}/">
</head> </html>
}
  FileUtils.mkpath(base_rdoc_output_dir)
  File.open("#{base_rdoc_output_dir}/index.html", 'w') {|out| out.print content }
end
60
+
61
# NOTE(review): 'rake/rdoctask' only exists in older Rake releases; newer
# ones provide the task through 'rdoc/task' -- confirm the targeted Rake.
require 'rake/rdoctask'

# `rake rdoc`: generates the API docs into WEBSITE_OUTPUT/rdoc/<version>.
Rake::RDocTask.new do |rdoc|
  base_rdoc_output_dir = WEBSITE_OUTPUT + '/rdoc'
  # strip the trailing newline so it does not leak into the directory name
  # or the title
  version = File.read('VERSION').strip
  rdoc.rdoc_dir = base_rdoc_output_dir + "/#{version}"
  rdoc.title = NAME + ' ' + version
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Writes WEBSITE_OUTPUT/rdoc/index.html, redirecting to the current
# version's rdoc.
task :create_redirect do
  base_rdoc_output_dir = WEBSITE_OUTPUT + '/rdoc'
  # The version must be read here: the `version` local of the RDocTask block
  # is not visible in this block (referencing it raised NameError).
  version = File.read('VERSION').strip
  rdoc_redirect(base_rdoc_output_dir, gemspec.homepage, version)
end

task :rdoc => :create_redirect
77
+
78
namespace :website do
  desc "checkout and configure the gh-pages submodule"
  # Does not run git itself: prints the commands to execute by hand, unless
  # WEBSITE_OUTPUT/.git already exists.
  task :init do
    git_dir = WEBSITE_OUTPUT + "/.git"
    if File.exist?(git_dir)
      puts "!! not doing anything, #{git_dir} already exists !!"
    else
      rule = "################################################"

      puts "(not sure why this won't work programmatically)"
      puts rule
      puts "[Execute these commands]"
      puts rule
      puts "git submodule init"
      puts "git submodule update"
      puts "pushd #{WEBSITE_OUTPUT}"
      # `git co` only works when the user has defined a `co` alias; print
      # the real command name instead
      puts "git checkout --track -b gh-pages origin/gh-pages ;"
      puts "popd"
      puts rule

      # not sure why this won't work!
      #%x{git submodule init}
      #%x{git submodule update}
      #Dir.chdir(WEBSITE_OUTPUT) do
      #  %x{git checkout --track -b gh-pages origin/gh-pages ;}
      #end
    end
  end
end

task :default => :spec

task :build => :gemspec
109
+
110
+ # credit: Rakefile modeled after Jeweler's
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.12
@@ -1,6 +1,6 @@
1
1
 
2
2
  module Ms
3
3
  module Sequest
4
- VERSION = '0.0.11'
4
+ VERSION = File.open(File.dirname(__FILE__) + '/../../VERSION') {|io| io.gets.chomp }
5
5
  end
6
6
  end
@@ -0,0 +1,498 @@
1
+
2
+
3
+ require 'sample_enzyme'
4
+ require 'xmlparser'
5
+ require 'spec_id'
6
+ require 'zlib'
7
+ require 'hash_by'
8
+ require 'arrayclass'
9
+ require 'fasta'
10
+
11
## have to pre-declare some guys
module ProteinReferenceable; end
module SpecID; end
module SpecID::Prot; end
module SpecID::Pep; end
module SpecIDXML; end

# For dealing with Bioworks .xml format
class Bioworks
  include SpecID

  # Regular expressions (each header value is expected on a single line)
  @@bioworksinfo_re = /<bioworksinfo>(.*)<\/bioworksinfo>/o
  @@modifications_re = /<modifications>(.*)<\/modifications>/o
  @@protein_re = /<protein>/o
  @@origfilename_re = /<origfilename>(.*)<\/origfilename>/o
  @@origfilepath_re = /<origfilepath>(.*)<\/origfilepath>/o

  attr_accessor :peps, :prots, :version, :global_filename, :origfilename, :origfilepath
  # a string of modifications e.g., "(M* +15.99491) (S@ +14.9322) "
  attr_accessor :modifications

  def hi_prob_best ; false end

  # -> prints to file filename1.sqt, filename2.sqt
  # @TODO: sqt file output (currently an empty stub)
  def to_sqt(params_file)
    ## hash peps by filename
    ## hash prots by peptide
  end

  # Returns the number of opening <protein> tags in +file+. Prints a warning
  # (it does not raise) if the counts of opening and closing tags disagree.
  def num_prots(file)
    re = /(<protein>)|(<\/protein>)/mo
    begin_tags = 0
    end_tags = 0
    File.read(file).scan(re) do |match|
      # match is [opening_tag_or_nil, closing_tag_or_nil]
      if match.first
        begin_tags += 1
      else
        end_tags += 1
      end
    end
    if begin_tags != end_tags
      puts "WARNING: #{file} doesn't have matching closing tags"
      puts "for the <protein> tag.  Returning # of beginning tags."
    end
    begin_tags
  end

  # Outputs the bioworks browser excel format (tab delimited) to file.
  # Useful if you have more than ~65,000 lines (can export bioworks.xml
  # and then convert to excel format).
  # Currently, the only things not precisely identical are:
  # 1. The peptide hit counts (although the first number [total # peptides] is accurate)
  # 2. The precise ordering of peptides within each protein. When dealing
  #    with output from multiple runs, peptides with runs with exactly the
  #    same scan numbers are not guaranteed to be in the same order.
  def to_excel(file)
    update_peptide_hit_counts
    rows = []
    rows << ['', 'Reference', '', '', '', 'Score', 'Coverage', 'MW', 'Accession', 'Peptide (Hits)', '', ' ']
    rows << ['', '"File, Scan(s)"', 'Peptide', 'MH+', 'z', 'XC', 'DeltaCn', 'Sp', 'RSp', 'Ions', 'Count', ' ']
    @prots.each_with_index do |prot, index|
      score, coverage, weight, accession = prot.get(:consensus_score, :coverage, :weight, :accession)
      # A zero coverage is shown as an empty cell. The value may be a Float
      # or a String, so compare numerically (a comparison against the string
      # "0.0" never matches a Float).
      coverage = "" if coverage.to_f == 0.0
      reference = '"' + prot.reference.to_s.split('|').last.to_s + '"'
      hit_counts = prot.peptide_hit_counts
      hit_counts_string = hit_counts[0].to_s + ' (' + hit_counts[1..-1].join(" ") + ')'
      rows << [index + 1, reference, '', '', '', score, coverage, weight, accession, hit_counts_string, "", " "]

      prot.peps.sort_by {|pep| [pep.first_scan.to_i, pep.last_scan.to_i] }.each do |pep|
        pep_fields = pep.get(:sequence, :mass, :charge, :xcorr, :deltacn, :sp, :rsp, :ions)
        count = pep.count
        # same reasoning as for coverage: count may be an Integer or a String
        count = "" if count.to_i == 0
        rows << (['', '"' + pep.file + '"'] + pep_fields + [count, ' '])
      end
    end
    File.open(file, "w") do |out|
      rows.each do |row|
        out.print(row.join("\t"), "\n")
      end
    end
  end

  # for output to excel format or other things, updates each protein
  # with a peptide hit count array based on ranking of xcorr per dta file
  # where each array is the total number of peptide hits, then rank 1,2,3,4,5
  # @TODO: Can't get this to check out yet.  Perhaps they use normalized
  # Xcorr?
  # NOTE(review): peptides are ranked in ascending xcorr order, so "rank 1"
  # is the lowest xcorr of a file -- confirm whether descending was intended.
  def update_peptide_hit_counts
    @prots.each do |prot|
      counts = prot.peptide_hit_counts
      # start from zero so that repeated calls do not accumulate rank counts
      counts.fill(0)
      counts[0] = prot.peps.size
    end
    peps.hash_by(:file).sort.each do |_file, file_peps|
      ranked = file_peps.sort_by {|pep| pep.xcorr.to_f }
      _uniq_peps, prot_groups = _uniq_peps_by_sequence_charge(ranked)

      prot_groups.each_with_index do |prot_group, i|
        prot_group.each do |prot|
          # only ranks that have a slot in the counts array are tallied
          prot.peptide_hit_counts[i+1] += 1 if prot.peptide_hit_counts[i+1]
        end
      end
    end
  end

  # returns (peptides, proteins) where peptides is the unique list of peps
  # (the first pep seen for each sequence/charge pair, in input order) and
  # proteins is a parallel array of arrays holding the proteins of every pep
  # sharing that sequence and charge.
  # The peps themselves are not modified: each protein list is a fresh array,
  # so a pep's own prots array is never appended to.
  # assumes that each peptide points to all its proteins in pep.prots
  def _uniq_peps_by_sequence_charge(peps)
    uniq_peps = []
    prot_groups = []
    group_for = {}   # [sequence, charge] => that key's array in prot_groups
    peps.each do |pep|
      key = [pep.sequence, pep.charge]
      group = group_for[key]
      if group
        group.concat(pep.prots || [])
      else
        group = (pep.prots || []).dup
        group_for[key] = group
        uniq_peps << pep
        prot_groups << group
      end
    end
    return uniq_peps, prot_groups
  end

  # file:: optional path to a Bioworks xml export (may be gzipped, see
  #        parse_xml); parsed immediately when given
  def initialize(file=nil)
    @peps = nil
    if file
      @filename = file
      parse_xml(file)
      #parse_xml_by_xmlparser(file)
    end
  end

  # Parses +file+ with Bioworks::XMLParser and stores the proteins in @prots
  # (@peps is not set by this method).
  def parse_xml_by_xmlparser(file)
    parser = Bioworks::XMLParser.new
    File.open(file) do |fh|
      #3.times do fh.gets end  ## TEMPFIX
      parser.parse(fh)
    end
    #puts "ETETWSST"
    #p parser.prots
    @prots = parser.prots
  end

  # This is highly specific to Bioworks 3.2 xml export.  In other words,
  # unless the newlines, etc. are duplicated, this parser will fail!  Not
  # robust, but it is faster than xmlparser (which is based on the speedy
  # expat)
  #
  # A file whose name ends in .gz is read through Zlib::GzipReader. The file
  # handle is closed even if parsing raises.
  def parse_xml(file)
    fh = (file.to_s =~ /\.gz$/) ? Zlib::GzipReader.open(file) : File.open(file)
    begin
      @origfilename = get_regex_val(fh, @@origfilename_re)
      @origfilepath = get_regex_val(fh, @@origfilepath_re)
      if @origfilename
        ext = File.extname(@origfilename)
        # drop only the trailing extension (gsub removed every occurrence of
        # the extension text, wherever it appeared in the name)
        @global_filename = ext.empty? ? @origfilename.dup : @origfilename.chomp(ext)
      end
      @version = get_regex_val(fh, @@bioworksinfo_re)
      @modifications = get_regex_val(fh, @@modifications_re)
      @prots, @peps = get_prots_from_xml_stream(fh)
    ensure
      fh.close
    end
  end

  ## returns proteins and peptides
  # Reads +fh+ to its end; every line matching <protein> starts a new
  # Bioworks::Prot which then consumes the lines of its own element.
  def get_prots_from_xml_stream(fh)
    uniq_pephit_hash = {}
    prots = []
    while line = fh.gets
      if line =~ @@protein_re
        prot = Bioworks::Prot.new
        prot.bioworks = self
        prot.set_from_xml_stream(fh, uniq_pephit_hash)
        prots << prot
      end
    end
    [prots, uniq_pephit_hash.values]
  end

  # Scans forward from the current position for a line matching +regex+ and
  # returns a copy of its first capture group, leaving the handle just past
  # that line. The scan stops at the first <protein> line (or at the end of
  # the file); in that case the handle is rewound to the beginning and nil
  # is returned.
  # Only gets and rewind are used, so this also works on handles without
  # seek (e.g. Zlib::GzipReader).
  def get_regex_val(fh, regex)
    found = nil
    while line = fh.gets
      if line =~ regex
        found = $1.dup
        break
      end
      break if line =~ @@protein_re
    end
    fh.rewind unless found
    found
  end

  # Outputs sequest xml files (pepxml) for the trans-proteomics pipeline
  # NOTE(review): currently returns only the value of xml_version, which is
  # not defined in this class -- presumably provided by a mixin; confirm.
  def to_pepxml
    string = xml_version
    string
  end

end
238
+
239
# Implements fast parsing via XMLParser (wrapper around Expat)
# It is actually slower (about %25 slower) than regular expression parsing
#
# Builds Bioworks::Prot and Bioworks::Pep objects from the element events;
# the resulting proteins are available through #prots.
class Bioworks::XMLParser < XMLParser
  @@at = '@'
  attr_accessor :prots

  def initialize
    @current_obj = nil    # the Prot or Pep currently being filled
    @current_hash = {}    # element name => text, for the current object
    @current_name = nil
    @current_data = nil   # text collected since the last start/end tag
    @prots = []
  end

  # Element start callback. <protein> and <peptide> open a new object; any
  # other element is treated as a field of the current object.
  def startElement(name, attrs)
    # a new element starts with an empty text buffer
    @current_data = nil
    case name
    when "peptide"
      curr_prot = @current_obj
      if @current_obj.class == Bioworks::Prot
        # first peptide of this protein: the protein's own fields are complete
        @current_obj.set_from_xml_hash_xmlparser(@current_hash)
      else
        # NOTE(review): relies on a `prot` accessor on the previous peptide,
        # which is not among the Bioworks::Pep fields visible here -- confirm.
        curr_prot = @current_obj.prot  ## unless previous was a peptide
      end
      peptide = Bioworks::Pep.new
      peptide.prot = curr_prot
      curr_prot.peps << peptide
      @current_obj = peptide
      @current_hash = {}
    when "protein"
      @current_obj = Bioworks::Prot.new
      @current_hash = {}
      @prots << @current_obj
    else
      @current_name = name
    end
  end

  # Element end callback. </peptide> finalizes the current peptide; the end
  # of any other element (except </protein>) stores the collected text under
  # the element's name ("" when the element had no text).
  def endElement(name)
    case name
    when "peptide"
      @current_obj.set_from_hash_given_text(@current_hash)
    when "protein"
    else
      @current_hash[name] = @current_data || ""
    end
    # text following this end tag does not belong to the stored value
    @current_data = nil
  end

  # Character data callback. Expat may deliver the text of one element in
  # several pieces, so the pieces are appended instead of keeping only the
  # last one.
  def character(data)
    if @current_data
      @current_data << data
    else
      @current_data = data.dup
    end
  end

end
291
+
292
module Bioworks::XML
  # The regular expression to grab attributes from the bioworks xml format:
  # group 1 is the opening tag name, group 2 the text between the tags (the
  # closing tag name is not checked against the opening one)
  @@att_re = /<([\w]+)>(.*)<\/[\w]+>/o
end

# A protein entry of a Bioworks xml file together with its peptide hits.
class Bioworks::Prot
  include ProteinReferenceable
  include SpecID::Prot
  include Bioworks::XML

  @@end_prot_re = /<\/protein>/o
  @@pep_re = /<peptide>/o
  @@atts = %w(reference protein_probability consensus_score sf unified_score coverage pi weight accession peps)
  attr_accessor :reference, :protein_probability, :consensus_score, :sf, :unified_score, :coverage, :pi, :weight, :accession, :peps, :bioworks, :peptide_hit_counts

  def initialize
    @peps = []
    # [total number of peptide hits, then one count per rank 1..5]
    @peptide_hit_counts = [0,0,0,0,0,0]
  end

  # returns array of values of the attributes given (as symbols)
  def get(*args)
    args.map {|attribute| send(attribute) }
  end

  # Reads the lines of one <protein> element from +fh+ (positioned just after
  # the opening tag) up to and including </protein>, then sets the attributes
  # from the collected fields. Returns self.
  #
  # uniq_pephit_hash:: shared hash keyed by
  #                    [base_name, first_scan, charge, sequence]; a peptide
  #                    whose key was already seen is replaced by the stored
  #                    object, so one Pep can belong to several proteins
  #
  # NOTE(review): a line matching none of the expected patterns prints a
  # message and calls exit (terminating the process) -- raising an error may
  # be preferable for library use.
  def set_from_xml_stream(fh, uniq_pephit_hash)
    fields = {}
    @peps = []
    while line = fh.gets
      if line =~ @@att_re
        fields[$1] = $2
      elsif line =~ @@pep_re
        ## Could do a look ahead to grab the file and sequence to check
        ## uniqueness to increase speed here.
        pep = Bioworks::Pep.new.set_from_xml_stream(fh)
        # normal search results files have a global filename
        # while multi-consensus do not
        pep[12] ||= bioworks.global_filename   # index 12 is base_name

        ## figure out uniqueness
        key = [pep.base_name, pep.first_scan, pep.charge, pep.sequence]
        if uniq_pephit_hash.key? key
          pep = uniq_pephit_hash[key]
        else
          ## first time this peptide hit is seen
          pep.prots = []
          uniq_pephit_hash[key] = pep
        end
        pep.prots << self
        @peps << pep

      elsif line =~ @@end_prot_re
        set_from_xml_hash(fields)
        break
      else
        puts "Bad parsing on: #{line}"
        puts "EXITING!"
        exit
      end
    end
    self
  end

  # Variant of set_from_xml_hash for hashes built by Bioworks::XMLParser:
  # drops the container element entries and renames the "Sf" and "pI" keys
  # to "sf" and "pi" (the hash is modified in place).
  def set_from_xml_hash_xmlparser(hash)
    hash.delete("sequestresults")
    hash.delete("bioworksinfo")
    hash["sf"] = hash.delete("Sf")
    hash["pi"] = hash.delete("pI")
    set_from_xml_hash(hash)
  end

  # Sets the attributes from a hash of element name => text. Numeric
  # attributes are cast with to_f; reference and accession are kept as given.
  # Sf and pI are read under either capitalization ("Sf"/"sf", "pI"/"pi"),
  # because set_from_xml_hash_xmlparser renames those keys before calling
  # this method.
  def set_from_xml_hash(hash)
    @reference = hash["reference"]
    @protein_probability = hash["protein_probability"].to_f
    #@probability = @protein_probability.to_f
    @consensus_score = hash["consensus_score"].to_f
    @sf = (hash["Sf"] || hash["sf"]).to_f
    @unified_score = hash["unified_score"].to_f
    @coverage = hash["coverage"].to_f
    @pi = (hash["pI"] || hash["pi"]).to_f
    @weight = hash["weight"].to_f
    @accession = hash["accession"]
  end
end
380
+
381
Bioworks::Pep = Arrayclass.new( %w(sequence mass deltamass charge xcorr deltacn sp rsp ions count tic prots base_name first_scan last_scan peptide_probability file _num_prots _first_prot aaseq) )
# 0=sequence 1=mass 2=deltamass 3=charge 4=xcorr 5=deltacn 6=sp 7=rsp 8=ions 9=count 10=tic 11=prots 12=base_name 13=first_scan 14=last_scan 15=peptide_probability 16=file 17=_num_prots 18=_first_prot 19=aaseq

# A peptide hit of a Bioworks xml file (an array-backed object; see the
# index list above).
class Bioworks::Pep
  include SpecID::Pep
  include Bioworks::XML
  include SpecIDXML

  @@file_split_first_re = /, /o
  @@file_split_second_re = / - /o
  #@@att_re = /<(.*)>(.*)<\/(.*)>/
  @@end_pep_re = /<\/peptide>/o
  @@file_one_scan_re = /(.*), (\d+)/o
  @@file_mult_scan_re = /(.*), (\d+) - (\d+)/o
  ## NOTE! the mass is really the theoretical MH+!!!!
  ## NOTE! set_from_hash_given_text casts the numeric fields (to_f / to_i);
  ## sequence, ions, file and the values derived from file (base_name,
  ## first_scan, last_scan) stay strings.

  #ions is a string 'x/y'

  ## other accessors:
  def probability ; self[15] end
  def mh ; self[1] end

  # This is not a true ppm since it should be divided by the actual mh instead
  # of the theoretical (but it is as close as we can get for this object)
  def ppm
    1.0e6 * (self[2].abs/self[1])   # deltamass.abs / mass
    #1.0e6 * (self.deltamass.abs/self.mh)
  end

  # returns array of values of the attributes given (as symbols)
  def get(*args)
    args.map {|attribute| send(attribute) }
  end

  #def peptide_probability=(prob)
  #  @peptide_probability = prob.to_f
  #end

  # takes arguments in one of two forms:
  #     1. file, first_scan[ - last_scan]
  #     2. scan[ - last_scan]
  # returns base_name, first_scan, last_scan (scans are returned as strings;
  # last_scan equals first_scan when only one scan is given)
  # base_name will be set for #1, nil for #2
  # The split is made at the LAST ", " so that a file name which itself
  # contains ", " is kept whole.
  def self.extract_file_info(arg)
    base_name, separator, scans = arg.rpartition(@@file_split_first_re)
    base_name = nil if separator.empty?   # form #2: no file part
    first_scan, last_scan = scans.split(@@file_split_second_re)
    last_scan ||= first_scan
    [base_name, first_scan, last_scan]
  end

  # $VERBOSE is switched off while file= is defined, presumably to silence
  # the redefinition warning for the accessor generated by Arrayclass.
  tmp_verb = $VERBOSE
  $VERBOSE = nil
  # Sets file and, derived from it, base_name, first_scan and last_scan.
  def file=(arg)
    ## Set these vals by index:
    #puts "AERRG: #{arg}"
    self[16] = arg
    self[12,3] = self.class.extract_file_info(arg)
  end
  $VERBOSE = tmp_verb

  undef_method :inspect
  # Short description; proteins are summarized by their count (0 when prots
  # has not been set) instead of being inspected themselves.
  def inspect
    prot_count = prots ? prots.size : 0
    "<Bioworks::Pep sequence: #{sequence}, mass: #{mass}, deltamass: #{deltamass}, charge: #{charge}, xcorr: #{xcorr}, deltacn: #{deltacn}, prots(count):#{prot_count}, base_name: #{base_name}, first_scan: #{first_scan}, last_scan: #{last_scan}, file: #{file}, peptide_probability: #{peptide_probability}, aaseq:#{aaseq}>"
  end

  # Fills the object from a hash of element name => text, casting the
  # numeric fields; file (and the fields derived from it),
  # peptide_probability and aaseq are set as well.
  def set_from_hash_given_text(hash)
    self[0,11] = [hash["sequence"], hash["mass"].to_f, hash["deltamass"].to_f, hash["charge"].to_i, hash["xcorr"].to_f, hash["deltacn"].to_f, hash["sp"].to_f, hash["rsp"].to_i, hash["ions"], hash["count"].to_i, hash["tic"].to_i]
    self.file = hash["file"]
    self[15] = hash["peptide_probability"].to_f
    self[19] = SpecID::Pep.sequence_to_aaseq(self[0])  ## aaseq
  end

  # Reads the lines of one <peptide> element from +fh+ (positioned just after
  # the opening tag) up to and including </peptide> and fills the object from
  # them. Returns self.
  #
  # NOTE(review): a line matching none of the expected patterns prints a
  # message and calls exit (terminating the process) -- raising an error may
  # be preferable for library use.
  def set_from_xml_stream(fh)
    fields = {}
    while line = fh.gets
      if line =~ @@att_re
        #fields[$1] = $2.dup
        fields[$1] = $2
        #puts "IN PEP: " + $1 + ": " + $2
      elsif line =~ @@end_pep_re
        set_from_hash_given_text(fields)
        #puts "SELF[12]: #{self[12]}"
        break
      else
        puts "Bad parsing on: #{line}"
        puts "EXITING!"
        exit
      end
    end
    self
  end

end
494
+
495
+
496
+
497
+
498
+