ms-mascot 0.2.0 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,76 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
- class Ms::Mascot::Dat::Proteins < Ms::Mascot::Dat::Section
3
+ module Ms::Mascot::Dat
4
+
5
+ # Proteins represent supplementary protein information in a dat file.
6
+ #
7
+ # Content-Type: application/x-Mascot; name="proteins"
8
+ #
9
+ # "ZN711_HUMAN"=87153.77,"Zinc finger protein 711 (Zinc finger protein 6) - Homo sapiens (Human)"
10
+ # "Y986_MYCTU"=27356.31,"Hypothetical ABC transporter ATP-binding protein Rv0986/MT1014 - Mycobacterium tuberculosis"
11
+ # "Y5G0_ENCCU"=33509.30,"Hypothetical protein ECU05_1600/ECU11_0130 - Encephalitozoon cuniculi"
12
+ #
13
+ # Proteins is (almost) a standard Section and defines methods for convenient
14
+ # access.
15
+ class Proteins < Section
16
+
17
+ # === Protein
18
+ #
19
+ # Represents protein data.
20
+ #
21
+ # # 87153.77,"Zinc finger protein 711 (Zinc finger protein 6) - Homo sapiens (Human)"
22
+ #
23
+ # index example meaning
24
+ # 0 87153.77 protein mass in Da
25
+ # 1 "Zinc finger..." a description string
26
+ #
27
+ Protein = Struct.new(
28
+ :mass,
29
+ :description
30
+ )
31
+
32
+ # A format string used to format parameters as a string.
33
+ TO_S_FORMAT = "\"%s\"=%s\n"
34
+
35
+ class << self
36
+
37
# Parses a new instance from str.  Special parsing is required to quickly
# remove the quotes from protein keys; each entry has the form:
#
#   "KEY"=value\n
#
# str must begin with a content-type declaration matching
# Section::CONTENT_TYPE_REGEXP, the first capture of which is used as the
# section name.  archive is passed through, unchanged, to new.
#
# Raises a RuntimeError if the content-type declaration cannot be found.
def parse(str, archive=nil)
  params = {}
  scanner = StringScanner.new(str)

  # skip whitespace and content type declaration
  unless scanner.scan(Section::CONTENT_TYPE_REGEXP)
    # report the text that failed to match (the original referenced an
    # undefined local here and raised NameError instead of this message)
    raise "unknown content type: #{scanner.peek(80).inspect}"
  end
  section_name = scanner[1]

  # scan each pair removing quotes from keys
  loop do
    scanner.skip(/"/)
    key = scanner.scan(/[^"]+/)
    break unless key

    scanner.skip(/"\=/)
    params[key] = scanner.scan(/[^\n]*/)
    scanner.skip(/\n/)
  end

  new(params, section_name, archive)
end
60
+ end
61
+
62
+ # Returns a Protein for the specified protein id.
63
+ def protein(id)
64
+ parse_protein(data[id])
65
+ end
66
+
67
+ private
68
+
69
# Parses a Protein from the protein data string, which has the form:
#
#   87153.77,"Zinc finger protein 711 - Homo sapiens (Human)"
#
# Returns nil when str is nil.  The mass is cast to a float and the
# surrounding quotes are removed from the description.  The description
# is nil if str contains no comma.
def parse_protein(str)
  return nil unless str

  # Split on the first comma only; descriptions may themselves contain
  # commas and must not be truncated.
  mass, description = str.split(',', 2)

  # drop the leading and trailing quote characters
  description = description[1...-1] if description

  Protein.new(mass.to_f, description)
end
75
+ end
4
76
  end
@@ -1,12 +1,147 @@
1
1
  require 'ms/mascot/dat/section'
2
+ require 'ms/mascot/mgf/entry'
3
+ require 'rack'
2
4
 
3
- class Ms::Mascot::Dat::Query < Ms::Mascot::Dat::Section
5
+ module Ms::Mascot::Dat
4
6
 
5
- attr_reader :index
7
+ # Query is a generic section for all queryN sections. Query contains query
8
+ # data that has different meaning depending on the type of search performed.
9
+ # Here is data from an MS/MS search:
10
+ #
11
+ # Content-Type: application/x-Mascot; name="query60"
12
+ #
13
+ # charge=3+
14
+ # mass_min=50.175000
15
+ # mass_max=1998.960000
16
+ # int_min=0.0364
17
+ # int_max=7366
18
+ # num_vals=3411
19
+ # num_used1=-1
20
+ # Ions1=129.098825:384.8,187.070000:461.5...
21
+ # ...
22
+ #
23
+ # Query is a standard Section and simply defines methods for convenient
24
+ # access. See Section for parsing details.
25
+ class Query < Ms::Mascot::Dat::Section
6
26
 
7
- def initialize(data={}, section_name=self.class.section_name)
8
- super(data, section_name)
9
- @index = section_name.strip[5..-1].to_i
10
- end
27
module Utils
  module_function

  # Scans an ion string for values, yielding each number as a string and
  # a flag signaling whether or not the number marks the end of a
  # datapoint (ie the number is the intensity).
  #
  #   str = "\nReformatted Ions\n"
  #   Query::Utils.scan_ions('1.23:4.56,7.8:9') do |num, end_point|
  #     str << num
  #     str << (end_point ? "\n" : " ")
  #   end
  #
  #   str
  #   # => %q{
  #   # Reformatted Ions
  #   # 1.23 4.56
  #   # 7.8 9
  #   # }
  #
  # Returns an Enumerator over the [num, end_point] pairs when no block
  # is given.
  def scan_ions(str) # :yields: num, end_point
    return enum_for(:scan_ions, str) unless block_given?

    scanner = StringScanner.new(str)
    while num = scanner.scan(/[^:,]+/)
      if scanner.skip(/:/)
        # a colon follows, so num is an m/z value with more to come
        yield(num, false)
      else
        # a comma (or the end of the string) closes the datapoint
        scanner.skip(/,/)
        yield(num, true)
      end
    end
  end

  # Parses an ion string into a simple data array.  Parse ions requires
  # data points be separated with a comma and mz/intensity values with a
  # colon, but is tolerant to integer and floats.
  #
  #   Query::Utils.parse_ions('1.23:4.56,7.8:9')  # => [[1.23, 4.56], [7.8, 9.0]]
  #
  # All ions are cast to floats; see scan_ions for scanning the string
  # values.  Returns an empty array when str is nil (for example when no
  # ion string is present).
  def parse_ions(str)
    return [] if str.nil?

    ions = []
    current = []

    scan_ions(str) do |num, end_point|
      current << num.to_f
      next unless end_point

      ions << current
      current = []
    end

    ions
  end
end
82
+
83
+ include Utils
84
+
85
+ # Returns the query index for self (ie 60 when section_name is 'query60')
86
+ attr_reader :index
87
+
88
+ def initialize(data={}, section_name=self.class.section_name, dat=nil)
89
+ super(data, section_name, dat)
90
+ data['title'] = Rack::Utils.unescape(data['title'].to_s)
91
+ @index = section_name.strip[5..-1].to_i
92
+ @ions=[]
93
+ end
94
+
95
+ # Returns the nth ion string in self.
96
+ def ion_str(n=1)
97
+ data["Ions#{n}"]
98
+ end
99
+
100
+ # Returns a simple array of the parsed nth ion string.
101
+ def ions(n=1)
102
+ @ions[n] ||= parse_ions(ion_str(n))
103
+ end
11
104
 
105
+ def title
106
+ data['title']
107
+ end
108
+
109
+ # allows access to values in data with method calls
110
+ #def method_missing(*args)
111
+ # if args.size == 1 && (val = data[arg.to_s])
112
+ # val
113
+ # else
114
+ # super(*args)
115
+ # end
116
+ #end
117
+
118
+ # returns a Ms::Mascot::Mgf::Entry object.
119
+ # pepmass may be a Numeric OR a PeptideHit object (extracting the pepmass
120
+ # by PeptideHit#peptide_mass + PeptideHit#delta_mass).
121
+ # options are:
122
+ #
123
+ # :valid_headers = true (default) | false
124
+ def to_mgf(pepmass, opts={})
125
+ opts = {:valid_headers => true}.merge(opts)
126
+ valid_headers = opts[:valid_headers]
127
+ header = {}
128
+ header['PEPMASS'] =
129
+ if pepmass.is_a? Numeric
130
+ pepmass
131
+ else
132
+ hit = pepmass
133
+ hit.peptide_mass + hit.delta_mass
134
+ end
135
+ data.each_pair do |key,value|
136
+ up = key.to_s.upcase
137
+ next if key =~ /Ions/
138
+ next if valid_headers && !Ms::Mascot::Mgf::VALID_LOCAL_HEADERS.include?(up)
139
+ header[up] = value
140
+ end
141
+ # note that we sort the ions because I think I've seen files without
142
+ # them being sorted
143
+ Ms::Mascot::Mgf::Entry.new(header, self.ions.sort)
144
+ end
145
+
146
+ end
12
147
  end
@@ -30,7 +30,7 @@ module Ms
30
30
  # Parses a new instance from str. Sections after the content-type
31
31
  # declaration are parsed into the parameters hash. Sections follow
32
32
  # a simple "key=value\n" pattern.
33
- def parse(str)
33
+ def parse(str, archive=nil)
34
34
  params = {}
35
35
  scanner = StringScanner.new(str)
36
36
 
@@ -42,12 +42,12 @@ module Ms
42
42
 
43
43
  # scan each pair.
44
44
  while key = scanner.scan(/[^=]+/)
45
- scanner.skip(/=/)
45
+ scanner.skip(/\=/)
46
46
  params[key] = scanner.scan(/[^\n]*/)
47
47
  scanner.skip(/\n/)
48
48
  end
49
49
 
50
- new(params, section_name)
50
+ new(params, section_name, archive)
51
51
  end
52
52
 
53
53
  # Returns the name of the section represented by this class. Section
@@ -67,9 +67,13 @@ module Ms
67
67
  # The class section_name.
68
68
  attr_reader :section_name
69
69
 
70
- def initialize(data={}, section_name=self.class.section_name)
70
+ # A backreference to the dat containing self.
71
+ attr_reader :dat
72
+
73
+ def initialize(data={}, section_name=self.class.section_name, dat=nil)
71
74
  @data = data
72
75
  @section_name = section_name
76
+ @dat = dat
73
77
  end
74
78
 
75
79
  # Formats self as a string with the content-type header.
@@ -78,7 +82,7 @@ module Ms
78
82
 
79
83
  Content-Type: application/x-Mascot; name="#{section_name}"
80
84
 
81
- #{data.to_a.collect {|entry| TO_S_FORMAT % entry}.join}}
85
+ #{data.to_a.collect {|entry| self.class::TO_S_FORMAT % entry}.join}}
82
86
  end
83
87
  end
84
88
  end
@@ -1,8 +1,226 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
- # Summaries differ in their meaning depending on the type of search but the
4
- # content is in the same format. The best way to add a sensible api and to
5
- # keep the basic archive lookup structure is to define modules that extend
6
- # a summary with, say an MS/MS ion search api.
7
- class Ms::Mascot::Dat::Summary < Ms::Mascot::Dat::Section
3
+ module Ms::Mascot::Dat
4
+
5
+ # Summary represent summary identification information in a dat file.
6
+ # Summaries differ in their meaning depending on the type of search but the
7
+ # content is in the same format. Currently the APIs for each of these
8
+ # distinct searches are mashed together although a saner approach would be
9
+ # to separate them.
10
+ #
11
+ # Content-Type: application/x-Mascot; name="summary"
12
+ #
13
+ # qmass1=497.265612
14
+ # qexp1=498.272888,1+
15
+ # qmatch1=5360
16
+ # qplughole1=0.000000
17
+ # qmass2=499.248736
18
+ # qexp2=500.256012,1+
19
+ # qmatch2=5759
20
+ # qplughole2=16.873721
21
+ # ...
22
+ # h1=CH60_HUMAN,1.40e+03,0.48,61016.38
23
+ # h1_text=60 kDa heat shock protein, mitochondrial precursor (Hsp60) (60 kDa chaperonin) (CPN60) (Heat shock
24
+ # h1_q1=-1
25
+ # h1_q2=-1
26
+ # ...
27
+ # h1_q11=0,832.382767,-0.032939,302,309,6.00,APGFGDNR,16,0000000000,45.35,1,0000002000000000000,0,0,3481.990000
28
+ # h1_q11_terms=K,K
29
+ # h1_q12=0,843.506577,-0.034557,345,352,7.00,VGEVIVTK,24,0000000000,45.74,2,0001002000000000000,0,0,1662.450000
30
+ # h1_q12_terms=K,D
31
+ # ...
32
+ #
33
+ # Summary is a standard Section and simply defines methods for convenient
34
+ # access. See Section for parsing details.
35
+ #
36
+ # === Interpretation
37
+ #
38
+ # Deciphering the protein hit information requires some cross-referencing with
39
+ # online results. Note that each hit references each query.
40
+ #
41
+ # hN=protein # protein hit N
42
+ # hN_text=description # description for hit N
43
+ # hN_qM=-1 # no peptide from query
44
+ # hN_qM=query # match for hit N from query M
45
+ # hN_qM_terms=A,B:C,D # n and c-termini residues for each protein match
46
+ #
47
+ # See the ProteinHit and QueryHit structures for interpretation of the
48
+ # specific hit data.
49
+ #--
50
+ #
51
+ class Summary < Section
52
+
53
+ # === ProteinHit
54
+ #
55
+ # Represents protein hit data, inferred by inspection of the MS/MS sample
56
+ # results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
57
+ #
58
+ # # str: CH60_HUMAN,1.40e+03,0.48,61016.38
59
+ # # desc: 60 kDa heat shock protein...
60
+ #
61
+ # index example meaning
62
+ # 0 CH60_HUMAN id
63
+ # 1 1.40e+03
64
+ # 2 0.48
65
+ # 3 61016.38 mass
66
+ # 4 60 kDa heat... text
67
+ #
68
+ ProteinHit = Struct.new(
69
+ :id,
70
+ :unknown1,
71
+ :unknown2,
72
+ :mass,
73
+ :text,
74
+ :query_hits
75
+ )
76
+
77
+ # Indices of ProteinHit terms that will be cast to floats.
78
+ ProteinHitFloatIndicies = [1,2,3]
79
+
80
+ # === QueryHit
81
+ #
82
+ # Represents query data, inferred by inspection of the MS/MS sample
83
+ # results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
84
+ #
85
+ # # str: 0,832.382767,-0.032939,302,309,6.00,APGFGDNR,16,0000000000,45.35,1,0000002000000000000,0,0,3481.990000
86
+ # # terms: K,R
87
+ #
88
+ # index example meaning
89
+ # 0 0 n Missed Cleavages
90
+ # 1 832.382767 Monoisotopic mass of neutral peptide Mr(calc)
91
+ # 2 -0.032939 actual - theoretical delta mass
92
+ # 3 302 peptide start index
93
+ # 4 309 peptide end index
94
+ # 5 6.00
95
+ # 6 APGFGDNR peptide sequence
96
+ # 7 16
97
+ # 8 0000000000 modification sites (including n,c residues; number indicates mod)
98
+ # 9 45.35 score
99
+ # 10 1
100
+ # 11 0000002000000000000
101
+ # 12 0
102
+ # 13 0
103
+ # 14 3481.990000
104
+ # 15 K nterm
105
+ # 16 R cterm
106
+ #
107
+ # The dat file is said to be generated by Mascot version 1.0, but the headers
108
+ # section records 2.1.119.
109
+ QueryHit = Struct.new(
110
+ :n_missed_cleavages,
111
+ :peptide_mass,
112
+ :delta_mass,
113
+ :peptide_start,
114
+ :peptide_end,
115
+ :unknown5,
116
+ :sequence,
117
+ :unknown7,
118
+ :modifications,
119
+ :score,
120
+ :unknown10,
121
+ :unknown11,
122
+ :unknown12,
123
+ :unknown13,
124
+ :unknown14,
125
+ :nterm,
126
+ :cterm
127
+ )
128
+
129
+ # Indices of QueryHit terms that will be cast to floats.
130
+ QueryHitFloatIndicies = [1,2,5,9,14]
131
+
132
+ # Indices of QueryHit terms that will be cast to integers.
133
+ QueryHitIntIndicies = [0,3,4,7,10,12,13]
134
+
135
module Utils
  module_function

  # Parses a ProteinHit from the hit string.
  #
  # str is the comma-separated hit data (ex 'CH60_HUMAN,1.40e+03,0.48,61016.38'),
  # desc is the description text for the hit, and query_hits is stored
  # as-is as the final member of the ProteinHit.  The terms at
  # ProteinHitFloatIndicies are cast to floats.
  def parse_protein_hit(str, desc, query_hits)
    data = str.split(",")
    ProteinHitFloatIndicies.each do |index|
      data[index] = data[index].to_f
    end
    data << desc
    data << query_hits

    ProteinHit.new(*data)
  end

  # Parses a QueryHit from the hit-query string.
  #
  # str is the comma-separated query data and terms is the comma-separated
  # n and c-terminal residues (ex 'K,R').  Returns nil if str is nil or
  # "-1".  A nil terms is tolerated, in which case nterm and cterm are
  # left nil.  The terms at QueryHitFloatIndicies and QueryHitIntIndicies
  # are cast to floats and integers, respectively.
  def parse_query_hit(str, terms)
    return nil if str.nil? || str == "-1"

    # terms.to_s guards against a missing terms entry
    data = str.split(",") + terms.to_s.split(",")
    QueryHitFloatIndicies.each do |index|
      data[index] = data[index].to_f
    end
    QueryHitIntIndicies.each do |index|
      data[index] = data[index].to_i
    end

    QueryHit.new(*data)
  end
end
165
+
166
+ include Utils
167
+
168
+ def initialize(data={}, section_name=self.class.section_name, dat=nil)
169
+ super(data, section_name, dat)
170
+ @protein_hits = []
171
+ @query_hits = []
172
+ end
173
+
174
+ # An array of protein hits. Specify resolve=false to return just the
175
+ # currently parsed hits.
176
+ #
177
+ # Note that the hits array is indexed the same as in Mascot, ie the
178
+ # ProteinHit for h1 is located at hits[1], meaning there is always
179
+ # an empty cell at hits[0].
180
+ def protein_hits(resolve=true)
181
+ return @protein_hits unless resolve
182
+
183
+ hit = 1
184
+ hit += 1 while protein_hit(hit)
185
+ @protein_hits
186
+ end
187
+
188
+ # Returns a ProteinHit at the hit index, or nil if no such hit exists.
189
+ def protein_hit(hit)
190
+ key = "h#{hit}"
191
+ return nil unless str = data[key]
192
+ @protein_hits[hit] ||= parse_protein_hit(str, data["#{key}_text"], query_hits(hit))
193
+ end
194
+
195
+ # Returns an array of QueryHits for the specified hit, or nil if no
196
+ # such hit exists.
197
+ def query_hits(hit)
198
+ query = 1
199
+ while data.has_key?("h#{hit}_q#{query}")
200
+ query_hit(hit, query)
201
+ query += 1
202
+ end
203
+
204
+ @query_hits[hit]
205
+ end
206
+
207
+ # Returns the QueryHit at the hit and query index, or nil if no such query
208
+ # exists.
209
+ def query_hit(hit, query)
210
+ key = "h#{hit}_q#{query}"
211
+ return nil unless data.has_key?(key)
212
+
213
+ queries = @query_hits[hit] ||= []
214
+ if existing_query = queries[query]
215
+ return existing_query
216
+ end
217
+
218
+ if parsed_query = parse_query_hit(data[key], data["#{key}_terms"])
219
+ queries[query] = parsed_query
220
+ return parsed_query
221
+ end
222
+
223
+ nil
224
+ end
225
+ end
8
226
  end