ms-mascot 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,76 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
- class Ms::Mascot::Dat::Proteins < Ms::Mascot::Dat::Section
3
+ module Ms::Mascot::Dat
4
+
5
+ # Proteins represent supplementary protein information in a dat file.
6
+ #
7
+ # Content-Type: application/x-Mascot; name="proteins"
8
+ #
9
+ # "ZN711_HUMAN"=87153.77,"Zinc finger protein 711 (Zinc finger protein 6) - Homo sapiens (Human)"
10
+ # "Y986_MYCTU"=27356.31,"Hypothetical ABC transporter ATP-binding protein Rv0986/MT1014 - Mycobacterium tuberculosis"
11
+ # "Y5G0_ENCCU"=33509.30,"Hypothetical protein ECU05_1600/ECU11_0130 - Encephalitozoon cuniculi"
12
+ #
13
+ # Proteins is (almost) a standard Section and defines methods for convenient
14
+ # access.
15
+ class Proteins < Section
16
+
17
+ # === Protein
18
+ #
19
+ # Represents protein data.
20
+ #
21
+ # # 87153.77,"Zinc finger protein 711 (Zinc finger protein 6) - Homo sapiens (Human)"
22
+ #
23
+ # index example meaning
24
+ # 0 87153.77 protein mass in Da
25
+ # 1 "Zinc finger..." a description string
26
+ #
27
+ Protein = Struct.new(
28
+ :mass,
29
+ :description
30
+ )
31
+
32
+ # A format string used to format parameters as a string.
33
+ TO_S_FORMAT = "\"%s\"=%s\n"
34
+
35
+ class << self
36
+
37
# Parses a new instance from str. Special parsing is required to quickly
# remove the quotes from protein keys, which are written as "KEY"=value.
#
# str must begin with a declaration matching Section::CONTENT_TYPE_REGEXP;
# the first capture of that pattern is used as the section name. Each
# following line is scanned as a quoted key, an equals sign, and the rest
# of the line, which is stored unparsed as the value. archive is passed
# through to new unchanged.
#
# Raises a RuntimeError if the content type declaration cannot be scanned.
def parse(str, archive=nil)
  params = {}
  scanner = StringScanner.new(str)

  # skip whitespace and content type declaration
  unless scanner.scan(Section::CONTENT_TYPE_REGEXP)
    # nothing was parsed, so report the start of the offending input
    raise "unknown content type: #{scanner.peek(80).inspect}"
  end
  section_name = scanner[1]

  # scan each pair removing quotes from keys
  loop do
    scanner.skip(/"/)
    break unless key = scanner.scan(/[^"]+/)
    scanner.skip(/"\=/)
    params[key] = scanner.scan(/[^\n]*/)
    scanner.skip(/\n/)
  end

  new(params, section_name, archive)
end
60
+ end
61
+
62
# Looks up the raw data string stored under the protein id and returns it
# parsed as a Protein (see parse_protein).
def protein(id)
  raw = data[id]
  parse_protein(raw)
end
66
+
67
+ private
68
+
69
# Parses a Protein from the protein data string, for example:
#
#   87153.77,"Zinc finger protein 711 (Zinc finger protein 6) - Homo sapiens (Human)"
#
# The mass is cast to a float and the quotes surrounding the description
# are removed. Returns nil when str is nil. The description is nil when
# str carries no description field.
def parse_protein(str)
  return nil unless str

  # split on the first comma only; descriptions may themselves contain commas
  mass, description = str.split(',', 2)

  # [1...-1] drops the leading and trailing quote characters
  Protein.new(mass.to_f, description && description[1...-1])
end
75
+ end
4
76
  end
@@ -1,12 +1,147 @@
1
1
  require 'ms/mascot/dat/section'
2
+ require 'ms/mascot/mgf/entry'
3
+ require 'rack'
2
4
 
3
- class Ms::Mascot::Dat::Query < Ms::Mascot::Dat::Section
5
+ module Ms::Mascot::Dat
4
6
 
5
- attr_reader :index
7
+ # Query is a generic section for all queryN sections. Query contains query
8
+ # data that has different meaning depending on the type of search performed.
9
+ # Here is data from an MS/MS search:
10
+ #
11
+ # Content-Type: application/x-Mascot; name="query60"
12
+ #
13
+ # charge=3+
14
+ # mass_min=50.175000
15
+ # mass_max=1998.960000
16
+ # int_min=0.0364
17
+ # int_max=7366
18
+ # num_vals=3411
19
+ # num_used1=-1
20
+ # Ions1=129.098825:384.8,187.070000:461.5...
21
+ # ...
22
+ #
23
+ # Query is a standard Section and simply defines methods for convenient
24
+ # access. See Section for parsing details.
25
+ class Query < Ms::Mascot::Dat::Section
6
26
 
7
- def initialize(data={}, section_name=self.class.section_name)
8
- super(data, section_name)
9
- @index = section_name.strip[5..-1].to_i
10
- end
27
module Utils
  module_function

  # Scans an ion string for values, yielding each number as a string and
  # a flag signaling whether or not the number marks the end of a datapoint
  # (ie the number is the intensity).
  #
  #   str = "\nReformatted Ions\n"
  #   Query::Utils.scan_ions('1.23:4.56,7.8:9') do |num, end_point|
  #     str << num
  #     str << (end_point ? "\n" : " ")
  #   end
  #
  #   str
  #   # => %q{
  #   # Reformatted Ions
  #   # 1.23 4.56
  #   # 7.8 9
  #   # }
  #
  # A number followed by a colon is yielded with a false flag; any other
  # number (followed by a comma or the end of the string) is yielded with a
  # true flag. Returns an Enumerator over the same pairs when no block is
  # given.
  def scan_ions(str) # :yields: num, end_point
    return enum_for(:scan_ions, str) unless block_given?

    scanner = StringScanner.new(str)
    while num = scanner.scan(/[^:,]+/)
      if scanner.skip(/:/)
        yield(num, false)
      else
        scanner.skip(/,/)
        yield(num, true)
      end
    end
  end

  # Parses an ion string into a simple data array. Parse ions requires
  # data points be separated with a comma and mz/intensity values with a
  # colon, but is tolerant to integer and floats.
  #
  #   Query::Utils.parse_ions('1.23:4.56,7.8:9')  # => [[1.23, 4.56], [7.8, 9.0]]
  #
  # All ions are cast to floats; see scan_ions for scanning the string
  # values. A nil str is treated as an empty ion string and parses to an
  # empty array.
  def parse_ions(str)
    ions = []
    current = []

    scan_ions(str.to_s) do |num, end_point|
      current << num.to_f

      # the end_point flag closes the current datapoint
      if end_point
        ions << current
        current = []
      end
    end
    ions
  end
end
82
+
83
+ include Utils
84
+
85
+ # Returns the query index for self (ie 60 when section_name is 'query60')
86
+ attr_reader :index
87
+
88
# Initializes a new Query. The arguments are passed on to the superclass
# initializer. The 'title' entry of data is replaced in place by its
# unescaped form (a missing title becomes an empty string), and index is
# read from the section name.
def initialize(data={}, section_name=self.class.section_name, dat=nil)
  super(data, section_name, dat)

  # presumably titles are stored URL-escaped in the dat file -- TODO confirm
  escaped_title = data['title'].to_s
  data['title'] = Rack::Utils.unescape(escaped_title)

  # everything after the first 5 characters of the stripped section name
  # (ie the '60' in 'query60'), cast to an integer
  index_digits = section_name.strip[5..-1]
  @index = index_digits.to_i

  # per-n cache of parsed ion arrays, filled in by ions
  @ions = []
end
94
+
95
# Returns the nth ion string in self, ie the value stored in data under
# "Ions<n>" (n defaults to 1).
def ion_str(n=1)
  key = "Ions#{n}"
  data[key]
end
99
+
100
# Returns a simple array of the parsed nth ion string. Results are cached
# per n, so repeated calls return the same array. A query without an nth
# ion string returns an empty array.
def ions(n=1)
  # to_s turns a missing (nil) ion string into "" so parse_ions can scan it
  @ions[n] ||= parse_ions(ion_str(n).to_s)
end
11
104
 
105
# Returns the value stored in data under 'title'.
def title
  stored_title = data['title']
  stored_title
end
108
+
109
+ # allows access to values in data with method calls
110
+ #def method_missing(*args)
111
+ # if args.size == 1 && (val = data[arg.to_s])
112
+ # val
113
+ # else
114
+ # super(*args)
115
+ # end
116
+ #end
117
+
118
# Returns a Ms::Mascot::Mgf::Entry object built from self.
#
# pepmass may be a Numeric OR a PeptideHit object, in which case the
# PEPMASS header is PeptideHit#peptide_mass + PeptideHit#delta_mass.
#
# The remaining headers are the entries of data with upcased keys; keys
# matching /Ions/ are never copied. Options are:
#
#   :valid_headers    true (default) | false; when true only keys listed
#                     in Ms::Mascot::Mgf::VALID_LOCAL_HEADERS are copied
#
def to_mgf(pepmass, opts={})
  settings = {:valid_headers => true}.merge(opts)
  only_valid = settings[:valid_headers]

  precursor_mass = pepmass.is_a?(Numeric) ? pepmass : (pepmass.peptide_mass + pepmass.delta_mass)
  header = {'PEPMASS' => precursor_mass}

  data.each_pair do |key, value|
    next if key =~ /Ions/

    name = key.to_s.upcase
    next if only_valid && !Ms::Mascot::Mgf::VALID_LOCAL_HEADERS.include?(name)
    header[name] = value
  end

  # the ions are sorted because files may exist where they are unsorted
  Ms::Mascot::Mgf::Entry.new(header, self.ions.sort)
end
145
+
146
+ end
12
147
  end
@@ -30,7 +30,7 @@ module Ms
30
30
  # Parses a new instance from str. Sections after the content-type
31
31
  # declaration are parsed into the parameters hash. Sections follow
32
32
  # a simple "key=value\n" pattern.
33
- def parse(str)
33
+ def parse(str, archive=nil)
34
34
  params = {}
35
35
  scanner = StringScanner.new(str)
36
36
 
@@ -42,12 +42,12 @@ module Ms
42
42
 
43
43
  # scan each pair.
44
44
  while key = scanner.scan(/[^=]+/)
45
- scanner.skip(/=/)
45
+ scanner.skip(/\=/)
46
46
  params[key] = scanner.scan(/[^\n]*/)
47
47
  scanner.skip(/\n/)
48
48
  end
49
49
 
50
- new(params, section_name)
50
+ new(params, section_name, archive)
51
51
  end
52
52
 
53
53
  # Returns the name of the section represented by this class. Section
@@ -67,9 +67,13 @@ module Ms
67
67
  # The class section_name.
68
68
  attr_reader :section_name
69
69
 
70
- def initialize(data={}, section_name=self.class.section_name)
70
+ # A backreference to the dat containing self.
71
+ attr_reader :dat
72
+
73
+ def initialize(data={}, section_name=self.class.section_name, dat=nil)
71
74
  @data = data
72
75
  @section_name = section_name
76
+ @dat = dat
73
77
  end
74
78
 
75
79
  # Formats self as a string with the content-type header.
@@ -78,7 +82,7 @@ module Ms
78
82
 
79
83
  Content-Type: application/x-Mascot; name="#{section_name}"
80
84
 
81
- #{data.to_a.collect {|entry| TO_S_FORMAT % entry}.join}}
85
+ #{data.to_a.collect {|entry| self.class::TO_S_FORMAT % entry}.join}}
82
86
  end
83
87
  end
84
88
  end
@@ -1,8 +1,226 @@
1
1
  require 'ms/mascot/dat/section'
2
2
 
3
- # Summaries differ in their meaning depending on the type of search but the
4
- # content is in the same format. The best way to add a sensible api and to
5
- # keep the basic archive lookup structure is to define modules that extend
6
- # a summary with, say an MS/MS ion search api.
7
- class Ms::Mascot::Dat::Summary < Ms::Mascot::Dat::Section
3
+ module Ms::Mascot::Dat
4
+
5
+ # Summary represents summary identification information in a dat file.
6
+ # Summaries differ in their meaning depending on the type of search but the
7
+ # content is in the same format. Currently the APIs for each of these
8
+ # distinct searches are mashed together although a saner approach would be
9
+ # to separate them.
10
+ #
11
+ # Content-Type: application/x-Mascot; name="summary"
12
+ #
13
+ # qmass1=497.265612
14
+ # qexp1=498.272888,1+
15
+ # qmatch1=5360
16
+ # qplughole1=0.000000
17
+ # qmass2=499.248736
18
+ # qexp2=500.256012,1+
19
+ # qmatch2=5759
20
+ # qplughole2=16.873721
21
+ # ...
22
+ # h1=CH60_HUMAN,1.40e+03,0.48,61016.38
23
+ # h1_text=60 kDa heat shock protein, mitochondrial precursor (Hsp60) (60 kDa chaperonin) (CPN60) (Heat shock
24
+ # h1_q1=-1
25
+ # h1_q2=-1
26
+ # ...
27
+ # h1_q11=0,832.382767,-0.032939,302,309,6.00,APGFGDNR,16,0000000000,45.35,1,0000002000000000000,0,0,3481.990000
28
+ # h1_q11_terms=K,K
29
+ # h1_q12=0,843.506577,-0.034557,345,352,7.00,VGEVIVTK,24,0000000000,45.74,2,0001002000000000000,0,0,1662.450000
30
+ # h1_q12_terms=K,D
31
+ # ...
32
+ #
33
+ # Summary is a standard Section and simply defines methods for convenient
34
+ # access. See Section for parsing details.
35
+ #
36
+ # === Interpretation
37
+ #
38
+ # Deciphering the protein hit information requires some cross-referencing with
39
+ # online results. Note that each hit references each query.
40
+ #
41
+ # hN=protein # protein hit N
42
+ # hN_text=description # description for hit N
43
+ # hN_qM=-1 # no peptide from query
44
+ # hN_qM=query # match for hit N from query M
45
+ # hN_qM_terms=A,B:C,D # n and c-termini residues for each protein match
46
+ #
47
+ # See the ProteinHit and QueryHit structures for interpretation of the
48
+ # specific hit data.
49
+ #--
50
+ #
51
+ class Summary < Section
52
+
53
+ # === ProteinHit
54
+ #
55
+ # Represents protein hit data, inferred by inspection of the MS/MS sample
56
+ # results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
57
+ #
58
+ # # str: CH60_HUMAN,1.40e+03,0.48,61016.38
59
+ # # desc: 60 kDa heat shock protein...
60
+ #
61
+ # index example meaning
62
+ # 0 CH60_HUMAN id
63
+ # 1 1.40e+03
64
+ # 2 0.48
65
+ # 3 61016.38 mass
66
+ # 4 60 kDa heat... text
67
+ #
68
+ ProteinHit = Struct.new(
69
+ :id,
70
+ :unknown1,
71
+ :unknown2,
72
+ :mass,
73
+ :text,
74
+ :query_hits
75
+ )
76
+
77
+ # Indices of ProteinHit terms that will be cast to floats.
78
+ ProteinHitFloatIndicies = [1,2,3]
79
+
80
+ # === QueryHit
81
+ #
82
+ # Represents query data, inferred by inspection of the MS/MS sample
83
+ # results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
84
+ #
85
+ # # str: 0,832.382767,-0.032939,302,309,6.00,APGFGDNR,16,0000000000,45.35,1,0000002000000000000,0,0,3481.990000
86
+ # # terms: K,R
87
+ #
88
+ # index example meaning
89
+ # 0 0 n Missed Cleavages
90
+ # 1 832.382767 Monoisotopic mass of neutral peptide Mr(calc)
91
+ # 2 -0.032939 actual - theoretical delta mass
92
+ # 3 302 peptide start index
93
+ # 4 309 peptide end index
94
+ # 5 6.00
95
+ # 6 APGFGDNR peptide sequence
96
+ # 7 16
97
+ # 8 0000000000 modification sites (including n,c residues; number indicates mod)
98
+ # 9 45.35 score
99
+ # 10 1
100
+ # 11 0000002000000000000
101
+ # 12 0
102
+ # 13 0
103
+ # 14 3481.990000
104
+ # 15 K nterm
105
+ # 16 R cterm
106
+ #
107
+ # The dat file is said to be generated by Mascot version 1.0, but the headers
108
+ # section records 2.1.119.
109
+ QueryHit = Struct.new(
110
+ :n_missed_cleavages,
111
+ :peptide_mass,
112
+ :delta_mass,
113
+ :peptide_start,
114
+ :peptide_end,
115
+ :unknown5,
116
+ :sequence,
117
+ :unknown7,
118
+ :modifications,
119
+ :score,
120
+ :unknown10,
121
+ :unknown11,
122
+ :unknown12,
123
+ :unknown13,
124
+ :unknown14,
125
+ :nterm,
126
+ :cterm
127
+ )
128
+
129
+ # Indices of QueryHit terms that will be cast to floats.
130
+ QueryHitFloatIndicies = [1,2,5,9,14]
131
+
132
+ # Indices of QueryHit terms that will be cast to integers.
133
+ QueryHitIntIndicies = [0,3,4,7,10,12,13]
134
+
135
module Utils
  module_function

  # Parses a ProteinHit from the hit string.
  #
  # str is split on commas and the fields at ProteinHitFloatIndicies are
  # cast to floats. desc and query_hits are then appended, in that order,
  # so they fill the members that follow the split fields.
  def parse_protein_hit(str, desc, query_hits)
    data = str.split(",")
    ProteinHitFloatIndicies.each do |index|
      data[index] = data[index].to_f
    end
    data << desc
    data << query_hits

    ProteinHit.new(*data)
  end

  # Parses a QueryHit from the hit-query string.
  #
  # Returns nil when str is nil or "-1". Otherwise the comma-separated
  # fields of str are followed by the comma-separated terms; fields at
  # QueryHitFloatIndicies are cast to floats and fields at
  # QueryHitIntIndicies to integers. A nil terms adds no fields, leaving
  # the trailing QueryHit members nil.
  def parse_query_hit(str, terms)
    return nil if str.nil? || str == "-1"

    # to_s tolerates a hit whose terms entry is missing
    data = str.split(",") + terms.to_s.split(",")
    QueryHitFloatIndicies.each do |index|
      data[index] = data[index].to_f
    end
    QueryHitIntIndicies.each do |index|
      data[index] = data[index].to_i
    end

    QueryHit.new(*data)
  end
end
165
+
166
+ include Utils
167
+
168
# Initializes a new Summary. The arguments are passed on to the superclass
# initializer; the caches of parsed protein hits and query hits start
# empty.
def initialize(data={}, section_name=self.class.section_name, dat=nil)
  super(data, section_name, dat)
  @protein_hits, @query_hits = [], []
end
173
+
174
# An array of protein hits. Specify resolve=false to return just the
# currently parsed hits; otherwise hits are parsed in order starting at
# h1 until an index with no hit is reached.
#
# Note that the hits array is indexed the same as in Mascot, ie the
# ProteinHit for h1 is located at hits[1], meaning there is always
# an empty cell at hits[0].
def protein_hits(resolve=true)
  if resolve
    index = 1
    while protein_hit(index)
      index += 1
    end
  end

  @protein_hits
end
187
+
188
# Returns a ProteinHit at the hit index, or nil if no such hit exists.
# The hit is parsed from the "h<hit>" and "h<hit>_text" entries of data
# on first access and cached afterwards.
def protein_hit(hit)
  hit_key = "h#{hit}"
  hit_str = data[hit_key]
  return nil unless hit_str

  cached = @protein_hits[hit]
  return cached if cached

  @protein_hits[hit] = parse_protein_hit(hit_str, data["#{hit_key}_text"], query_hits(hit))
end
194
+
195
# Returns an array of QueryHits for the specified hit, or nil if no
# such hit exists. Queries are resolved in order starting at q1 until a
# query key is missing from data. The array is indexed by query number,
# so queries without a parsed hit are left nil.
def query_hits(hit)
  prefix = "h#{hit}_q"
  index = 1

  while data.has_key?("#{prefix}#{index}")
    query_hit(hit, index)
    index += 1
  end

  @query_hits[hit]
end
206
+
207
# Returns the QueryHit at the hit and query index, or nil if no such query
# exists or the entry does not parse to a hit (see parse_query_hit).
# Parsed hits are cached per hit and query index.
def query_hit(hit, query)
  key = "h#{hit}_q#{query}"
  return nil unless data.has_key?(key)

  # the per-hit cache is created as soon as any query key exists for the hit
  cache = (@query_hits[hit] ||= [])
  cached = cache[query]
  return cached if cached

  parsed = parse_query_hit(data[key], data["#{key}_terms"])
  parsed ? (cache[query] = parsed) : nil
end
225
+ end
8
226
  end