ms-mascot 0.2.0 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History +11 -1
- data/lib/ms/mascot.rb +4 -1
- data/lib/ms/mascot/dat/archive.rb +85 -8
- data/lib/ms/mascot/dat/header.rb +12 -0
- data/lib/ms/mascot/dat/index.rb +13 -19
- data/lib/ms/mascot/dat/masses.rb +14 -0
- data/lib/ms/mascot/dat/parameters.rb +14 -0
- data/lib/ms/mascot/dat/peptides.rb +211 -2
- data/lib/ms/mascot/dat/proteins.rb +73 -1
- data/lib/ms/mascot/dat/query.rb +141 -6
- data/lib/ms/mascot/dat/section.rb +9 -5
- data/lib/ms/mascot/dat/summary.rb +223 -5
- data/lib/ms/mascot/export.rb +55 -72
- data/lib/ms/mascot/format_mgf.rb +7 -8
- data/lib/ms/mascot/fragment.rb +1 -1
- data/lib/ms/mascot/mgf.rb +32 -2
- data/lib/ms/mascot/mgf/archive.rb +8 -1
- data/lib/ms/mascot/submit.rb +52 -69
- data/lib/ms/mascot/validation.rb +17 -0
- metadata +7 -6
@@ -1,4 +1,76 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
-
|
3
|
+
module Ms::Mascot::Dat
|
4
|
+
|
5
|
+
# Proteins represent supplementary protein information in a dat file.
|
6
|
+
#
|
7
|
+
# Content-Type: application/x-Mascot; name="proteins"
|
8
|
+
#
|
9
|
+
# "ZN711_HUMAN"=87153.77,"Zinc finger protein 711 (Zinc finger protein 6) - Homo sapiens (Human)"
|
10
|
+
# "Y986_MYCTU"=27356.31,"Hypothetical ABC transporter ATP-binding protein Rv0986/MT1014 - Mycobacterium tuberculosis"
|
11
|
+
# "Y5G0_ENCCU"=33509.30,"Hypothetical protein ECU05_1600/ECU11_0130 - Encephalitozoon cuniculi"
|
12
|
+
#
|
13
|
+
# Proteins is (almost) a standard Section and defines methods for convenient
|
14
|
+
# access.
|
15
|
+
class Proteins < Section
|
16
|
+
|
17
|
+
# === Protein
|
18
|
+
#
|
19
|
+
# Represents protein data.
|
20
|
+
#
|
21
|
+
# # 87153.77,"Zinc finger protein 711 (Zinc finger protein 6) - Homo sapiens (Human)"
|
22
|
+
#
|
23
|
+
# index example meaning
|
24
|
+
# 0 87153.77 protein mass in Da
|
25
|
+
# 1 "Zinc finger..." a description string
|
26
|
+
#
|
27
|
+
# Value object for one proteins entry: numeric mass plus unquoted description.
Protein = Struct.new(:mass, :description)
|
31
|
+
|
32
|
+
# A format string used to format parameters as a string.
|
33
|
+
TO_S_FORMAT = "\"%s\"=%s\n".freeze # frozen so the shared format cannot be mutated in place
|
34
|
+
|
35
|
+
class << self
|
36
|
+
|
37
|
+
# Parses a new instance from str. Special parsing is required to quickly
|
38
|
+
# remove the quotes from protein keys.
|
39
|
+
def parse(str, archive=nil)
  # str:     the raw text of the section, starting at its content-type header
  # archive: passed through unchanged as the third argument to new
  # Raises RuntimeError when str does not begin with a content-type header.
  params = {}
  scanner = StringScanner.new(str)

  # skip whitespace and content type declaration
  unless scanner.scan(Section::CONTENT_TYPE_REGEXP)
    # The message used to interpolate an undefined local (content_type), so
    # malformed input surfaced as a NameError; report the input head instead.
    raise "unknown content type: #{str[0, 80].inspect}"
  end
  section_name = scanner[1]

  # each entry is "KEY"=VALUE\n; the key is stored without its quotes
  loop do
    scanner.skip(/"/)
    break unless (key = scanner.scan(/[^"]+/))
    scanner.skip(/"\=/)
    params[key] = scanner.scan(/[^\n]*/)
    scanner.skip(/\n/)
  end

  new(params, section_name, archive)
end
|
60
|
+
end
|
61
|
+
|
62
|
+
# Returns a Protein for the specified protein id.
|
63
|
+
def protein(id)
  # look up the raw entry first; parse_protein handles a missing (nil) entry
  raw = data[id]
  parse_protein(raw)
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
# Parses a Protein from the protein data string.
|
70
|
+
def parse_protein(str)
  # str is the value part of an entry, e.g. 87153.77,"Zinc finger protein 711"
  # Returns nil for a nil str, otherwise a Protein.
  return nil unless str

  # Split on the first comma only: descriptions may themselves contain commas
  # and must not be cut short.
  mass, description = str.split(',', 2)

  # Drop the surrounding quotes; an entry without a description stays nil.
  description = description[1...-1] if description
  Protein.new(mass.to_f, description)
end
|
75
|
+
end
|
4
76
|
end
|
data/lib/ms/mascot/dat/query.rb
CHANGED
@@ -1,12 +1,147 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
|
+
require 'ms/mascot/mgf/entry'
|
3
|
+
require 'rack'
|
2
4
|
|
3
|
-
|
5
|
+
module Ms::Mascot::Dat
|
4
6
|
|
5
|
-
|
7
|
+
# Query is a generic section for all queryN sections. Query contains query
|
8
|
+
# data that has different meaning depending on the type of search performed.
|
9
|
+
# Here is data from an MS/MS search:
|
10
|
+
#
|
11
|
+
# Content-Type: application/x-Mascot; name="query60"
|
12
|
+
#
|
13
|
+
# charge=3+
|
14
|
+
# mass_min=50.175000
|
15
|
+
# mass_max=1998.960000
|
16
|
+
# int_min=0.0364
|
17
|
+
# int_max=7366
|
18
|
+
# num_vals=3411
|
19
|
+
# num_used1=-1
|
20
|
+
# Ions1=129.098825:384.8,187.070000:461.5...
|
21
|
+
# ...
|
22
|
+
#
|
23
|
+
# Query is a standard Section and simply defines methods for convenient
|
24
|
+
# access. See Section for parsing details.
|
25
|
+
class Query < Ms::Mascot::Dat::Section
|
6
26
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
27
|
+
module Utils
|
28
|
+
module_function
|
29
|
+
|
30
|
+
# Scans an ion string for values, yielding each number as a string and
|
31
|
+
# a flag signaling whether or not the number marks the end of a datapoint
|
32
|
+
# (ie the number is the intensity).
|
33
|
+
#
|
34
|
+
# str = "\nReformatted Ions\n"
|
35
|
+
# Query.scan_ions('1.23:4.56,7.8:9') do |num, end_point|
|
36
|
+
# str << num
|
37
|
+
# str << (end_point ? "\n" : " ")
|
38
|
+
# end
|
39
|
+
#
|
40
|
+
# str
|
41
|
+
# # => %q{
|
42
|
+
# # Reformatted Ions
|
43
|
+
# # 1.23 4.56
|
44
|
+
# # 7.8 9
|
45
|
+
# # }
|
46
|
+
#
|
47
|
+
def scan_ions(str) # :yields: num, end_point
  # Without a block, return an Enumerator over the same [num, end_point]
  # pairs rather than failing on the first yield.
  return enum_for(:scan_ions, str) unless block_given?

  scanner = StringScanner.new(str)
  while (num = scanner.scan(/[^:,]+/))
    # a trailing ':' means more of the datapoint follows; anything else
    # (a ',' or the end of the string) closes the datapoint
    end_point = !scanner.skip(/:/)
    scanner.skip(/,/) if end_point
    yield(num, end_point)
  end
end
|
58
|
+
|
59
|
+
# Parses an ion string into a simple data array. Parse ions requires
|
60
|
+
# data points be separated with a comma and mz/intensity values with a
|
61
|
+
# colon, but is tolerant of integers and floats.
|
62
|
+
#
|
63
|
+
# Query.parse_ions('1.23:4.56,7.8:9') # => [[1.23, 4.56], [7.8, 9]]
|
64
|
+
#
|
65
|
+
# All ions are cast to floats; see scan_ions for scanning the string
|
66
|
+
# values.
|
67
|
+
def parse_ions(str)
  points = []
  pending = []

  # collect numbers into the pending datapoint until scan_ions flags its end
  scan_ions(str) do |value, complete|
    pending.push(value.to_f)
    next unless complete

    points.push(pending)
    pending = []
  end

  points
end
|
81
|
+
end
|
82
|
+
|
83
|
+
include Utils
|
84
|
+
|
85
|
+
# Returns the query index for self (ie 60 when section_name is 'query60')
|
86
|
+
attr_reader :index
|
87
|
+
|
88
|
+
def initialize(data={}, section_name=self.class.section_name, dat=nil)
  super(data, section_name, dat)

  # store the title URL-unescaped; a missing title becomes an empty string
  escaped_title = data['title'].to_s
  data['title'] = Rack::Utils.unescape(escaped_title)

  # everything after the first five characters ("query") is the index
  trimmed_name = section_name.strip
  @index = trimmed_name[5..-1].to_i

  # cache of parsed ion arrays, filled on demand by ions(n)
  @ions = []
end
|
94
|
+
|
95
|
+
# Returns the nth ion string in self.
|
96
|
+
def ion_str(n=1)
  # ion strings are stored under the keys Ions1, Ions2, ...
  key = "Ions#{n}"
  data[key]
end
|
99
|
+
|
100
|
+
# Returns a simple array of the parsed nth ion string.
|
101
|
+
def ions(n=1)
  # Parsed datapoints are cached per n. A query with no IonsN entry yields an
  # empty array; handing nil to the parser would raise a TypeError instead.
  @ions[n] ||= begin
    str = ion_str(n)
    str ? parse_ions(str) : []
  end
end
|
11
104
|
|
105
|
+
# Returns the title stored in the query data.
def title
  stored = data
  stored['title']
end
|
108
|
+
|
109
|
+
# allows access to values in data with method calls
|
110
|
+
#def method_missing(*args)
|
111
|
+
# if args.size == 1 && (val = data[arg.to_s])
|
112
|
+
# val
|
113
|
+
# else
|
114
|
+
# super(*args)
|
115
|
+
# end
|
116
|
+
#end
|
117
|
+
|
118
|
+
# returns a Ms::Mascot::Mgf::Entry object.
|
119
|
+
# pepmass may be a Numeric OR a PeptideHit object (extracting the pepmass
|
120
|
+
# by PeptideHit#peptide_mass + PeptideHit#delta_mass).
|
121
|
+
# options are:
|
122
|
+
#
|
123
|
+
# :valid_headers = true (default) | false
|
124
|
+
def to_mgf(pepmass, opts={})
  restrict = {:valid_headers => true}.merge(opts)[:valid_headers]

  # a Numeric is used as-is; anything else is treated as a hit whose
  # peptide and delta masses are summed
  precursor = pepmass.is_a?(Numeric) ? pepmass : pepmass.peptide_mass + pepmass.delta_mass
  header = {'PEPMASS' => precursor}

  data.each_pair do |key, value|
    # ion strings become the entry data, never headers
    next if key =~ /Ions/

    name = key.to_s.upcase
    next if restrict && !Ms::Mascot::Mgf::VALID_LOCAL_HEADERS.include?(name)

    header[name] = value
  end

  # the ions are sorted because source files may not store them in order
  Ms::Mascot::Mgf::Entry.new(header, ions.sort)
end
|
145
|
+
|
146
|
+
end
|
12
147
|
end
|
@@ -30,7 +30,7 @@ module Ms
|
|
30
30
|
# Parses a new instance from str. Sections after the content-type
|
31
31
|
# declaration are parsed into the parameters hash. Sections follow
|
32
32
|
# a simple "key=value\n" pattern.
|
33
|
-
def parse(str)
|
33
|
+
def parse(str, archive=nil)
|
34
34
|
params = {}
|
35
35
|
scanner = StringScanner.new(str)
|
36
36
|
|
@@ -42,12 +42,12 @@ module Ms
|
|
42
42
|
|
43
43
|
# scan each pair.
|
44
44
|
while key = scanner.scan(/[^=]+/)
|
45
|
-
scanner.skip(
|
45
|
+
scanner.skip(/\=/)
|
46
46
|
params[key] = scanner.scan(/[^\n]*/)
|
47
47
|
scanner.skip(/\n/)
|
48
48
|
end
|
49
49
|
|
50
|
-
new(params, section_name)
|
50
|
+
new(params, section_name, archive)
|
51
51
|
end
|
52
52
|
|
53
53
|
# Returns the name of the section represented by this class. Section
|
@@ -67,9 +67,13 @@ module Ms
|
|
67
67
|
# The class section_name.
|
68
68
|
attr_reader :section_name
|
69
69
|
|
70
|
-
|
70
|
+
# A backreference to the dat containing self.
|
71
|
+
attr_reader :dat
|
72
|
+
|
73
|
+
def initialize(data={}, section_name=self.class.section_name, dat=nil)
  # keep the section name, its key/value data, and the owning dat (may be nil)
  @section_name = section_name
  @data = data
  @dat = dat
end
|
74
78
|
|
75
79
|
# Formats self as a string with the content-type header.
|
@@ -78,7 +82,7 @@ module Ms
|
|
78
82
|
|
79
83
|
Content-Type: application/x-Mascot; name="#{section_name}"
|
80
84
|
|
81
|
-
#{data.to_a.collect {|entry| TO_S_FORMAT % entry}.join}}
|
85
|
+
#{data.to_a.collect {|entry| self.class::TO_S_FORMAT % entry}.join}}
|
82
86
|
end
|
83
87
|
end
|
84
88
|
end
|
@@ -1,8 +1,226 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
#
|
6
|
-
#
|
7
|
-
|
3
|
+
module Ms::Mascot::Dat
|
4
|
+
|
5
|
+
# Summary represents summary identification information in a dat file.
|
6
|
+
# Summaries differ in their meaning depending on the type of search but the
|
7
|
+
# content is in the same format. Currently the APIs for each of these
|
8
|
+
# distinct searches are mashed together although a saner approach would be
|
9
|
+
# to separate them.
|
10
|
+
#
|
11
|
+
# Content-Type: application/x-Mascot; name="summary"
|
12
|
+
#
|
13
|
+
# qmass1=497.265612
|
14
|
+
# qexp1=498.272888,1+
|
15
|
+
# qmatch1=5360
|
16
|
+
# qplughole1=0.000000
|
17
|
+
# qmass2=499.248736
|
18
|
+
# qexp2=500.256012,1+
|
19
|
+
# qmatch2=5759
|
20
|
+
# qplughole2=16.873721
|
21
|
+
# ...
|
22
|
+
# h1=CH60_HUMAN,1.40e+03,0.48,61016.38
|
23
|
+
# h1_text=60 kDa heat shock protein, mitochondrial precursor (Hsp60) (60 kDa chaperonin) (CPN60) (Heat shock
|
24
|
+
# h1_q1=-1
|
25
|
+
# h1_q2=-1
|
26
|
+
# ...
|
27
|
+
# h1_q11=0,832.382767,-0.032939,302,309,6.00,APGFGDNR,16,0000000000,45.35,1,0000002000000000000,0,0,3481.990000
|
28
|
+
# h1_q11_terms=K,K
|
29
|
+
# h1_q12=0,843.506577,-0.034557,345,352,7.00,VGEVIVTK,24,0000000000,45.74,2,0001002000000000000,0,0,1662.450000
|
30
|
+
# h1_q12_terms=K,D
|
31
|
+
# ...
|
32
|
+
#
|
33
|
+
# Summary is a standard Section and simply defines methods for convenient
|
34
|
+
# access. See Section for parsing details.
|
35
|
+
#
|
36
|
+
# === Interpretation
|
37
|
+
#
|
38
|
+
# Deciphering the protein hit information requires some cross-referencing with
|
39
|
+
# online results. Note that each hit references each query.
|
40
|
+
#
|
41
|
+
# hN=protein # protein hit N
|
42
|
+
# hN_text=description # description for hit N
|
43
|
+
# hN_qM=-1 # no peptide from query
|
44
|
+
# hN_qM=query # match for hit N from query M
|
45
|
+
# hN_qM_terms=A,B:C,D # n and c-termini residues for each protein match
|
46
|
+
#
|
47
|
+
# See the ProteinHit and QueryHit structures for interpretation of the
|
48
|
+
# specific hit data.
|
49
|
+
#--
|
50
|
+
#
|
51
|
+
class Summary < Section
|
52
|
+
|
53
|
+
# === ProteinHit
|
54
|
+
#
|
55
|
+
# Represents protein hit data, inferred by inspection of the MS/MS sample
|
56
|
+
# results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
|
57
|
+
#
|
58
|
+
# # str: CH60_HUMAN,1.40e+03,0.48,61016.38
|
59
|
+
# # desc: 60 kDa heat shock protein...
|
60
|
+
#
|
61
|
+
# index example meaning
|
62
|
+
# 0 CH60_HUMAN id
|
63
|
+
# 1 1.40e+03
|
64
|
+
# 2 0.48
|
65
|
+
# 3 61016.38 mass
|
66
|
+
# 4 60 kDa heat... text
|
67
|
+
#
|
68
|
+
# Fields 0-3 come from the hN string, text from hN_text, and query_hits holds
# the parsed hits for the protein.
ProteinHit = Struct.new(:id, :unknown1, :unknown2, :mass, :text, :query_hits)
|
76
|
+
|
77
|
+
# Indices of ProteinHit terms that will be cast to floats.
|
78
|
+
ProteinHitFloatIndicies = [1, 2, 3].freeze # frozen: shared lookup table, never mutated
|
79
|
+
|
80
|
+
# === QueryHit
|
81
|
+
#
|
82
|
+
# Represents query data, inferred by inspection of the MS/MS sample
|
83
|
+
# results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
|
84
|
+
#
|
85
|
+
# # str: 0,832.382767,-0.032939,302,309,6.00,APGFGDNR,16,0000000000,45.35,1,0000002000000000000,0,0,3481.990000
|
86
|
+
# # terms: K,R
|
87
|
+
#
|
88
|
+
# index example meaning
|
89
|
+
# 0 0 n Missed Cleavages
|
90
|
+
# 1 832.382767 Monoisotopic mass of neutral peptide Mr(calc)
|
91
|
+
# 2 -0.032939 actual - theoretical delta mass
|
92
|
+
# 3 302 peptide start index
|
93
|
+
# 4 309 peptide end index
|
94
|
+
# 5 6.00
|
95
|
+
# 6 APGFGDNR peptide sequence
|
96
|
+
# 7 16
|
97
|
+
# 8 0000000000 modification sites (including n,c residues; number indicates mod)
|
98
|
+
# 9 45.35 score
|
99
|
+
# 10 1
|
100
|
+
# 11 0000002000000000000
|
101
|
+
# 12 0
|
102
|
+
# 13 0
|
103
|
+
# 14 3481.990000
|
104
|
+
# 15 K nterm
|
105
|
+
# 16 R cterm
|
106
|
+
#
|
107
|
+
# The dat file is said to be generated by Mascot version 1.0, but the headers
|
108
|
+
# section records 2.1.119.
|
109
|
+
# Members follow the order of the comma-separated hN_qM fields, followed by
# the two hN_qM_terms values (nterm, cterm).
QueryHit = Struct.new(
  :n_missed_cleavages, :peptide_mass, :delta_mass,
  :peptide_start, :peptide_end, :unknown5,
  :sequence, :unknown7, :modifications,
  :score, :unknown10, :unknown11,
  :unknown12, :unknown13, :unknown14,
  :nterm, :cterm
)
|
128
|
+
|
129
|
+
# Indices of QueryHit terms that will be cast to floats.
|
130
|
+
QueryHitFloatIndicies = [1, 2, 5, 9, 14].freeze # frozen: shared lookup table, never mutated
|
131
|
+
|
132
|
+
# Indices of QueryHit terms that will be cast to integers.
|
133
|
+
QueryHitIntIndicies = [0, 3, 4, 7, 10, 12, 13].freeze # frozen: shared lookup table, never mutated
|
134
|
+
|
135
|
+
module Utils
|
136
|
+
module_function
|
137
|
+
|
138
|
+
# Parses a ProteinHit from the hit string.
|
139
|
+
def parse_protein_hit(str, desc, query_hits)
  fields = str.split(",")

  # numeric columns arrive as strings; cast them in place
  ProteinHitFloatIndicies.each { |i| fields[i] = fields[i].to_f }

  # the description and the query hits fill the last two struct members
  ProteinHit.new(*(fields + [desc, query_hits]))
end
|
149
|
+
|
150
|
+
# Parses a QueryHit from the hit-query string.
|
151
|
+
def parse_query_hit(str, terms)
  # str:   the hN_qM value; nil or "-1" means no peptide matched (returns nil)
  # terms: the hN_qM_terms value ("nterm,cterm"); may be nil
  return nil if str.nil? || str == "-1"

  # to_s tolerates a missing terms entry, leaving nterm/cterm nil rather than
  # raising NoMethodError on nil
  data = str.split(",") + terms.to_s.split(",")

  QueryHitFloatIndicies.each { |index| data[index] = data[index].to_f }
  QueryHitIntIndicies.each { |index| data[index] = data[index].to_i }

  QueryHit.new(*data)
end
|
164
|
+
end
|
165
|
+
|
166
|
+
include Utils
|
167
|
+
|
168
|
+
def initialize(data={}, section_name=self.class.section_name, dat=nil)
  super(data, section_name, dat)

  # caches filled on demand and indexed by hit number
  @protein_hits = []
  @query_hits = []
end
|
173
|
+
|
174
|
+
# An array of protein hits. Specify resolve=false to return just the
|
175
|
+
# currently parsed hits.
|
176
|
+
#
|
177
|
+
# Note that the hits array is indexed the same as in Mascot, ie the
|
178
|
+
# ProteinHit for h1 is located at hits[1], meaning there is always
|
179
|
+
# an empty cell at hits[0].
|
180
|
+
def protein_hits(resolve=true)
  if resolve
    # ask for h1, h2, ... until one is missing so every existing hit is cached
    number = 1
    number += 1 while protein_hit(number)
  end

  @protein_hits
end
|
187
|
+
|
188
|
+
# Returns a ProteinHit at the hit index, or nil if no such hit exists.
|
189
|
+
def protein_hit(hit)
  key = "h#{hit}"
  str = data[key]
  return nil unless str

  # reuse the cached hit when this index was already parsed
  cached = @protein_hits[hit]
  return cached if cached

  @protein_hits[hit] = parse_protein_hit(str, data["#{key}_text"], query_hits(hit))
end
|
194
|
+
|
195
|
+
# Returns an array of QueryHits for the specified hit, or nil if no
|
196
|
+
# such hit exists.
|
197
|
+
def query_hits(hit)
  prefix = "h#{hit}_q"
  index = 1

  # visit hN_q1, hN_q2, ... until a key is missing so each entry gets parsed
  while data.has_key?(prefix + index.to_s)
    query_hit(hit, index)
    index += 1
  end

  @query_hits[hit]
end
|
206
|
+
|
207
|
+
# Returns the QueryHit at the hit and query index, or nil if no such query
|
208
|
+
# exists.
|
209
|
+
def query_hit(hit, query)
  key = "h#{hit}_q#{query}"
  return nil unless data.has_key?(key)

  cache = (@query_hits[hit] ||= [])

  # use the cached hit if present, otherwise parse it now
  found = cache[query] || parse_query_hit(data[key], data["#{key}_terms"])
  return nil unless found

  # only real hits are stored; entries that parse to nothing leave the
  # cache untouched
  cache[query] = found
end
|
225
|
+
end
|
8
226
|
end
|