ms-mascot 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History +11 -1
- data/lib/ms/mascot.rb +4 -1
- data/lib/ms/mascot/dat/archive.rb +85 -8
- data/lib/ms/mascot/dat/header.rb +12 -0
- data/lib/ms/mascot/dat/index.rb +13 -19
- data/lib/ms/mascot/dat/masses.rb +14 -0
- data/lib/ms/mascot/dat/parameters.rb +14 -0
- data/lib/ms/mascot/dat/peptides.rb +211 -2
- data/lib/ms/mascot/dat/proteins.rb +73 -1
- data/lib/ms/mascot/dat/query.rb +141 -6
- data/lib/ms/mascot/dat/section.rb +9 -5
- data/lib/ms/mascot/dat/summary.rb +223 -5
- data/lib/ms/mascot/export.rb +55 -72
- data/lib/ms/mascot/format_mgf.rb +7 -8
- data/lib/ms/mascot/fragment.rb +1 -1
- data/lib/ms/mascot/mgf.rb +32 -2
- data/lib/ms/mascot/mgf/archive.rb +8 -1
- data/lib/ms/mascot/submit.rb +52 -69
- data/lib/ms/mascot/validation.rb +17 -0
- metadata +7 -6
@@ -1,4 +1,76 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
-
|
3
|
+
module Ms::Mascot::Dat
|
4
|
+
|
5
|
+
# Proteins represent supplementary protein information in a dat file.
|
6
|
+
#
|
7
|
+
# Content-Type: application/x-Mascot; name="proteins"
|
8
|
+
#
|
9
|
+
# "ZN711_HUMAN"=87153.77,"Zinc finger protein 711 (Zinc finger protein 6) - Homo sapiens (Human)"
|
10
|
+
# "Y986_MYCTU"=27356.31,"Hypothetical ABC transporter ATP-binding protein Rv0986/MT1014 - Mycobacterium tuberculosis"
|
11
|
+
# "Y5G0_ENCCU"=33509.30,"Hypothetical protein ECU05_1600/ECU11_0130 - Encephalitozoon cuniculi"
|
12
|
+
#
|
13
|
+
# Proteins is (almost) a standard Section and defines methods for convenient
|
14
|
+
# access.
|
15
|
+
class Proteins < Section
|
16
|
+
|
17
|
+
# === Protein
|
18
|
+
#
|
19
|
+
# Represents protein data.
|
20
|
+
#
|
21
|
+
# # 87153.77,"Zinc finger protein 711 (Zinc finger protein 6) - Homo sapiens (Human)"
|
22
|
+
#
|
23
|
+
# index example meaning
|
24
|
+
# 0 87153.77 protein mass in Da
|
25
|
+
# 1 "Zinc finger..." a description string
|
26
|
+
#
|
27
|
+
Protein = Struct.new(
|
28
|
+
:mass,
|
29
|
+
:description
|
30
|
+
)
|
31
|
+
|
32
|
+
# A format string used to format parameters as a string.
|
33
|
+
TO_S_FORMAT = "\"%s\"=%s\n"
|
34
|
+
|
35
|
+
class << self
|
36
|
+
|
37
|
+
# Parses a new instance from str. Special parsing is required to quickly
|
38
|
+
# remove the quotes from protein keys.
|
39
|
+
def parse(str, archive=nil)
|
40
|
+
params = {}
|
41
|
+
scanner = StringScanner.new(str)
|
42
|
+
|
43
|
+
# skip whitespace and content type declaration
|
44
|
+
unless scanner.scan(Section::CONTENT_TYPE_REGEXP)
|
45
|
+
raise "unknown content type: #{content_type}"
|
46
|
+
end
|
47
|
+
section_name = scanner[1]
|
48
|
+
|
49
|
+
# scan each pair removing quotes from keys
|
50
|
+
while true
|
51
|
+
scanner.skip(/"/)
|
52
|
+
break unless key = scanner.scan(/[^"]+/)
|
53
|
+
scanner.skip(/"\=/)
|
54
|
+
params[key] = scanner.scan(/[^\n]*/)
|
55
|
+
scanner.skip(/\n/)
|
56
|
+
end
|
57
|
+
|
58
|
+
new(params, section_name, archive)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
# Returns a Protein for the specified protein id.
|
63
|
+
def protein(id)
|
64
|
+
parse_protein(data[id])
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
# Parses a Protein from the protein data string.
|
70
|
+
def parse_protein(str)
|
71
|
+
return nil unless str
|
72
|
+
mass, description = str.split(',')
|
73
|
+
Protein.new(mass.to_f, description[1...-1])
|
74
|
+
end
|
75
|
+
end
|
4
76
|
end
|
data/lib/ms/mascot/dat/query.rb
CHANGED
@@ -1,12 +1,147 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
|
+
require 'ms/mascot/mgf/entry'
|
3
|
+
require 'rack'
|
2
4
|
|
3
|
-
|
5
|
+
module Ms::Mascot::Dat
|
4
6
|
|
5
|
-
|
7
|
+
# Query is a generic section for all queryN sections. Query contains query
|
8
|
+
# data that has different meaning depending on the type of search performed.
|
9
|
+
# Here is data from an MS/MS search:
|
10
|
+
#
|
11
|
+
# Content-Type: application/x-Mascot; name="query60"
|
12
|
+
#
|
13
|
+
# charge=3+
|
14
|
+
# mass_min=50.175000
|
15
|
+
# mass_max=1998.960000
|
16
|
+
# int_min=0.0364
|
17
|
+
# int_max=7366
|
18
|
+
# num_vals=3411
|
19
|
+
# num_used1=-1
|
20
|
+
# Ions1=129.098825:384.8,187.070000:461.5...
|
21
|
+
# ...
|
22
|
+
#
|
23
|
+
# Query is a standard Section and simply defines methods for convenient
|
24
|
+
# access. See Section for parsing details.
|
25
|
+
class Query < Ms::Mascot::Dat::Section
|
6
26
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
27
|
+
module Utils
|
28
|
+
module_function
|
29
|
+
|
30
|
+
# Scans an ion string for values, yielding each number as a string and the
|
31
|
+
# flag signaling whether or not the number marks the end of a datapoint
|
32
|
+
# (ie the number is the intensity).
|
33
|
+
#
|
34
|
+
# str = "\nReformatted Ions\n"
|
35
|
+
# Query.scan_ions('1.23:4.56,7.8:9') do |num, end_point|
|
36
|
+
# str << num
|
37
|
+
# str << (end_point ? "\n" : " ")
|
38
|
+
# end
|
39
|
+
#
|
40
|
+
# str
|
41
|
+
# # => %q{
|
42
|
+
# # Reformatted Ions
|
43
|
+
# # 1.23 4.56
|
44
|
+
# # 7.8 9
|
45
|
+
# # }
|
46
|
+
#
|
47
|
+
def scan_ions(str) # :yields: num, end_point
|
48
|
+
scanner = StringScanner.new(str)
|
49
|
+
while num = scanner.scan(/[^:,]+/)
|
50
|
+
if scanner.skip(/:/)
|
51
|
+
yield(num, false)
|
52
|
+
else
|
53
|
+
scanner.skip(/,/)
|
54
|
+
yield(num, true)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# Parses an ion string into a simple data array. Parse ions requires
|
60
|
+
# data points be separated with a comma and mz/intensity values with a
|
61
|
+
# colon, but is tolerant of integers and floats.
|
62
|
+
#
|
63
|
+
# Query.parse_ions('1.23:4.56,7.8:9') # => [[1.23, 4.56], [7.8, 9]]
|
64
|
+
#
|
65
|
+
# All ions are cast to floats; see scan_ions for scanning the string
|
66
|
+
# values.
|
67
|
+
def parse_ions(str)
|
68
|
+
ions = []
|
69
|
+
current = []
|
70
|
+
|
71
|
+
scan_ions(str) do |num, end_point|
|
72
|
+
current << num.to_f
|
73
|
+
|
74
|
+
if end_point
|
75
|
+
ions << current
|
76
|
+
current = []
|
77
|
+
end
|
78
|
+
end
|
79
|
+
ions
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
include Utils
|
84
|
+
|
85
|
+
# Returns the query index for self (ie 60 when section_name is 'query60')
|
86
|
+
attr_reader :index
|
87
|
+
|
88
|
+
def initialize(data={}, section_name=self.class.section_name, dat=nil)
|
89
|
+
super(data, section_name, dat)
|
90
|
+
data['title'] = Rack::Utils.unescape(data['title'].to_s)
|
91
|
+
@index = section_name.strip[5..-1].to_i
|
92
|
+
@ions=[]
|
93
|
+
end
|
94
|
+
|
95
|
+
# Returns the nth ion string in self.
|
96
|
+
def ion_str(n=1)
|
97
|
+
data["Ions#{n}"]
|
98
|
+
end
|
99
|
+
|
100
|
+
# Returns a simple array of the parsed nth ion string.
|
101
|
+
def ions(n=1)
|
102
|
+
@ions[n] ||= parse_ions(ion_str(n))
|
103
|
+
end
|
11
104
|
|
105
|
+
def title
|
106
|
+
data['title']
|
107
|
+
end
|
108
|
+
|
109
|
+
# allows access to values in data with method calls
|
110
|
+
#def method_missing(*args)
|
111
|
+
# if args.size == 1 && (val = data[arg.to_s])
|
112
|
+
# val
|
113
|
+
# else
|
114
|
+
# super(*args)
|
115
|
+
# end
|
116
|
+
#end
|
117
|
+
|
118
|
+
# returns a Ms::Mascot::Mgf::Entry object.
|
119
|
+
# pepmass may be a Numeric OR a PeptideHit object (extracting the pepmass
|
120
|
+
# by PeptideHit#peptide_mass + PeptideHit#delta_mass)
|
121
|
+
# options are:
|
122
|
+
#
|
123
|
+
# :valid_headers = true (default) | false
|
124
|
+
def to_mgf(pepmass, opts={})
|
125
|
+
opts = {:valid_headers => true}.merge(opts)
|
126
|
+
valid_headers = opts[:valid_headers]
|
127
|
+
header = {}
|
128
|
+
header['PEPMASS'] =
|
129
|
+
if pepmass.is_a? Numeric
|
130
|
+
pepmass
|
131
|
+
else
|
132
|
+
hit = pepmass
|
133
|
+
hit.peptide_mass + hit.delta_mass
|
134
|
+
end
|
135
|
+
data.each_pair do |key,value|
|
136
|
+
up = key.to_s.upcase
|
137
|
+
next if key =~ /Ions/
|
138
|
+
next if valid_headers && !Ms::Mascot::Mgf::VALID_LOCAL_HEADERS.include?(up)
|
139
|
+
header[up] = value
|
140
|
+
end
|
141
|
+
# note that we sort the ions because I think I've seen files without
|
142
|
+
# them being sorted
|
143
|
+
Ms::Mascot::Mgf::Entry.new(header, self.ions.sort)
|
144
|
+
end
|
145
|
+
|
146
|
+
end
|
12
147
|
end
|
@@ -30,7 +30,7 @@ module Ms
|
|
30
30
|
# Parses a new instance from str. Sections after the content-type
|
31
31
|
# declaration are parsed into the parameters hash. Sections follow
|
32
32
|
# a simple "key=value\n" pattern.
|
33
|
-
def parse(str)
|
33
|
+
def parse(str, archive=nil)
|
34
34
|
params = {}
|
35
35
|
scanner = StringScanner.new(str)
|
36
36
|
|
@@ -42,12 +42,12 @@ module Ms
|
|
42
42
|
|
43
43
|
# scan each pair.
|
44
44
|
while key = scanner.scan(/[^=]+/)
|
45
|
-
scanner.skip(
|
45
|
+
scanner.skip(/\=/)
|
46
46
|
params[key] = scanner.scan(/[^\n]*/)
|
47
47
|
scanner.skip(/\n/)
|
48
48
|
end
|
49
49
|
|
50
|
-
new(params, section_name)
|
50
|
+
new(params, section_name, archive)
|
51
51
|
end
|
52
52
|
|
53
53
|
# Returns the name of the section represented by this class. Section
|
@@ -67,9 +67,13 @@ module Ms
|
|
67
67
|
# The class section_name.
|
68
68
|
attr_reader :section_name
|
69
69
|
|
70
|
-
|
70
|
+
# A backreference to the dat containing self.
|
71
|
+
attr_reader :dat
|
72
|
+
|
73
|
+
def initialize(data={}, section_name=self.class.section_name, dat=nil)
|
71
74
|
@data = data
|
72
75
|
@section_name = section_name
|
76
|
+
@dat = dat
|
73
77
|
end
|
74
78
|
|
75
79
|
# Formats self as a string with the content-type header.
|
@@ -78,7 +82,7 @@ module Ms
|
|
78
82
|
|
79
83
|
Content-Type: application/x-Mascot; name="#{section_name}"
|
80
84
|
|
81
|
-
#{data.to_a.collect {|entry| TO_S_FORMAT % entry}.join}}
|
85
|
+
#{data.to_a.collect {|entry| self.class::TO_S_FORMAT % entry}.join}}
|
82
86
|
end
|
83
87
|
end
|
84
88
|
end
|
@@ -1,8 +1,226 @@
|
|
1
1
|
require 'ms/mascot/dat/section'
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
#
|
6
|
-
#
|
7
|
-
|
3
|
+
module Ms::Mascot::Dat
|
4
|
+
|
5
|
+
# Summary represents summary identification information in a dat file.
|
6
|
+
# Summaries differ in their meaning depending on the type of search but the
|
7
|
+
# content is in the same format. Currently the APIs for each of these
|
8
|
+
# distinct searches are mashed together although a saner approach would be
|
9
|
+
# to separate them.
|
10
|
+
#
|
11
|
+
# Content-Type: application/x-Mascot; name="summary"
|
12
|
+
#
|
13
|
+
# qmass1=497.265612
|
14
|
+
# qexp1=498.272888,1+
|
15
|
+
# qmatch1=5360
|
16
|
+
# qplughole1=0.000000
|
17
|
+
# qmass2=499.248736
|
18
|
+
# qexp2=500.256012,1+
|
19
|
+
# qmatch2=5759
|
20
|
+
# qplughole2=16.873721
|
21
|
+
# ...
|
22
|
+
# h1=CH60_HUMAN,1.40e+03,0.48,61016.38
|
23
|
+
# h1_text=60 kDa heat shock protein, mitochondrial precursor (Hsp60) (60 kDa chaperonin) (CPN60) (Heat shock
|
24
|
+
# h1_q1=-1
|
25
|
+
# h1_q2=-1
|
26
|
+
# ...
|
27
|
+
# h1_q11=0,832.382767,-0.032939,302,309,6.00,APGFGDNR,16,0000000000,45.35,1,0000002000000000000,0,0,3481.990000
|
28
|
+
# h1_q11_terms=K,K
|
29
|
+
# h1_q12=0,843.506577,-0.034557,345,352,7.00,VGEVIVTK,24,0000000000,45.74,2,0001002000000000000,0,0,1662.450000
|
30
|
+
# h1_q12_terms=K,D
|
31
|
+
# ...
|
32
|
+
#
|
33
|
+
# Summary is a standard Section and simply defines methods for convenient
|
34
|
+
# access. See Section for parsing details.
|
35
|
+
#
|
36
|
+
# === Interpretation
|
37
|
+
#
|
38
|
+
# Deciphering the protein hit information requires some cross-referencing with
|
39
|
+
# online results. Note that each hit references each query.
|
40
|
+
#
|
41
|
+
# hN=protein # protein hit N
|
42
|
+
# hN_text=description # description for hit N
|
43
|
+
# hN_qM=-1 # no peptide from query
|
44
|
+
# hN_qM=query # match for hit N from query M
|
45
|
+
# hN_qM_terms=A,B:C,D # n and c-termini residues for each protein match
|
46
|
+
#
|
47
|
+
# See the ProteinHit and QueryHit structures for interpretation of the
|
48
|
+
# specific hit data.
|
49
|
+
#--
|
50
|
+
#
|
51
|
+
class Summary < Section
|
52
|
+
|
53
|
+
# === ProteinHit
|
54
|
+
#
|
55
|
+
# Represents protein hit data, inferred by inspection of the MS/MS sample
|
56
|
+
# results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
|
57
|
+
#
|
58
|
+
# # str: CH60_HUMAN,1.40e+03,0.48,61016.38
|
59
|
+
# # desc: 60 kDa heat shock protein...
|
60
|
+
#
|
61
|
+
# index example meaning
|
62
|
+
# 0 CH60_HUMAN id
|
63
|
+
# 1 1.40e+03
|
64
|
+
# 2 0.48
|
65
|
+
# 3 61016.38 mass
|
66
|
+
# 4 60 kDa heat... text
|
67
|
+
#
|
68
|
+
ProteinHit = Struct.new(
|
69
|
+
:id,
|
70
|
+
:unknown1,
|
71
|
+
:unknown2,
|
72
|
+
:mass,
|
73
|
+
:text,
|
74
|
+
:query_hits
|
75
|
+
)
|
76
|
+
|
77
|
+
# Indices of ProteinHit terms that will be cast to floats.
|
78
|
+
ProteinHitFloatIndicies = [1,2,3]
|
79
|
+
|
80
|
+
# === QueryHit
|
81
|
+
#
|
82
|
+
# Represents query data, inferred by inspection of the MS/MS sample
|
83
|
+
# results, esp {F981123.dat}[http://www.matrixscience.com/cgi/peptide_view.pl?file=../data/F981123.dat&query=2&hit=1&index=&px=1&section=5&ave_thresh=38].
|
84
|
+
#
|
85
|
+
# # str: 0,832.382767,-0.032939,302,309,6.00,APGFGDNR,16,0000000000,45.35,1,0000002000000000000,0,0,3481.990000
|
86
|
+
# # terms: K,R
|
87
|
+
#
|
88
|
+
# index example meaning
|
89
|
+
# 0 0 n Missed Cleavages
|
90
|
+
# 1 832.382767 Monoisotopic mass of neutral peptide Mr(calc)
|
91
|
+
# 2 -0.032939 actual - theoretical delta mass
|
92
|
+
# 3 302 peptide start index
|
93
|
+
# 4 309 peptide end index
|
94
|
+
# 5 6.00
|
95
|
+
# 6 APGFGDNR peptide sequence
|
96
|
+
# 7 16
|
97
|
+
# 8 0000000000 modification sites (including n,c residues; number indicates mod)
|
98
|
+
# 9 45.35 score
|
99
|
+
# 10 1
|
100
|
+
# 11 0000002000000000000
|
101
|
+
# 12 0
|
102
|
+
# 13 0
|
103
|
+
# 14 3481.990000
|
104
|
+
# 15 K nterm
|
105
|
+
# 16 R cterm
|
106
|
+
#
|
107
|
+
# The dat file is said to be generated by Mascot version 1.0, but the headers
|
108
|
+
# section records 2.1.119.
|
109
|
+
QueryHit = Struct.new(
|
110
|
+
:n_missed_cleavages,
|
111
|
+
:peptide_mass,
|
112
|
+
:delta_mass,
|
113
|
+
:peptide_start,
|
114
|
+
:peptide_end,
|
115
|
+
:unknown5,
|
116
|
+
:sequence,
|
117
|
+
:unknown7,
|
118
|
+
:modifications,
|
119
|
+
:score,
|
120
|
+
:unknown10,
|
121
|
+
:unknown11,
|
122
|
+
:unknown12,
|
123
|
+
:unknown13,
|
124
|
+
:unknown14,
|
125
|
+
:nterm,
|
126
|
+
:cterm
|
127
|
+
)
|
128
|
+
|
129
|
+
# Indices of QueryHit terms that will be cast to floats.
|
130
|
+
QueryHitFloatIndicies = [1,2,5,9,14]
|
131
|
+
|
132
|
+
# Indices of QueryHit terms that will be cast to integers.
|
133
|
+
QueryHitIntIndicies = [0,3,4,7,10,12,13]
|
134
|
+
|
135
|
+
module Utils
|
136
|
+
module_function
|
137
|
+
|
138
|
+
# Parses a ProteinHit from the hit string.
|
139
|
+
def parse_protein_hit(str, desc, query_hits)
|
140
|
+
data = str.split(",")
|
141
|
+
ProteinHitFloatIndicies.each do |index|
|
142
|
+
data[index] = data[index].to_f
|
143
|
+
end
|
144
|
+
data << desc
|
145
|
+
data << query_hits
|
146
|
+
|
147
|
+
ProteinHit.new(*data)
|
148
|
+
end
|
149
|
+
|
150
|
+
# Parses a QueryHit from the hit-query string.
|
151
|
+
def parse_query_hit(str, terms)
|
152
|
+
return nil if str == nil || str == "-1"
|
153
|
+
|
154
|
+
data = str.split(",") + terms.split(",")
|
155
|
+
QueryHitFloatIndicies.each do |index|
|
156
|
+
data[index] = data[index].to_f
|
157
|
+
end
|
158
|
+
QueryHitIntIndicies.each do |index|
|
159
|
+
data[index] = data[index].to_i
|
160
|
+
end
|
161
|
+
|
162
|
+
QueryHit.new(*data)
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
include Utils
|
167
|
+
|
168
|
+
def initialize(data={}, section_name=self.class.section_name, dat=nil)
|
169
|
+
super(data, section_name, dat)
|
170
|
+
@protein_hits = []
|
171
|
+
@query_hits = []
|
172
|
+
end
|
173
|
+
|
174
|
+
# An array of protein hits. Specify resolve=false to return just the
|
175
|
+
# currently parsed hits.
|
176
|
+
#
|
177
|
+
# Note that the hits array is indexed the same as in Mascot, ie the
|
178
|
+
# ProteinHit for h1 is located at hits[1], meaning there is always
|
179
|
+
# an empty cell at hits[0].
|
180
|
+
def protein_hits(resolve=true)
|
181
|
+
return @protein_hits unless resolve
|
182
|
+
|
183
|
+
hit = 1
|
184
|
+
hit += 1 while protein_hit(hit)
|
185
|
+
@protein_hits
|
186
|
+
end
|
187
|
+
|
188
|
+
# Returns a ProteinHit at the hit index, or nil if no such hit exists.
|
189
|
+
def protein_hit(hit)
|
190
|
+
key = "h#{hit}"
|
191
|
+
return nil unless str = data[key]
|
192
|
+
@protein_hits[hit] ||= parse_protein_hit(str, data["#{key}_text"], query_hits(hit))
|
193
|
+
end
|
194
|
+
|
195
|
+
# Returns an array of QueryHits for the specified hit, or nil if no
|
196
|
+
# such hit exists.
|
197
|
+
def query_hits(hit)
|
198
|
+
query = 1
|
199
|
+
while data.has_key?("h#{hit}_q#{query}")
|
200
|
+
query_hit(hit, query)
|
201
|
+
query += 1
|
202
|
+
end
|
203
|
+
|
204
|
+
@query_hits[hit]
|
205
|
+
end
|
206
|
+
|
207
|
+
# Returns the QueryHit at the hit and query index, or nil if no such query
|
208
|
+
# exists.
|
209
|
+
def query_hit(hit, query)
|
210
|
+
key = "h#{hit}_q#{query}"
|
211
|
+
return nil unless data.has_key?(key)
|
212
|
+
|
213
|
+
queries = @query_hits[hit] ||= []
|
214
|
+
if existing_query = queries[query]
|
215
|
+
return existing_query
|
216
|
+
end
|
217
|
+
|
218
|
+
if parsed_query = parse_query_hit(data[key], data["#{key}_terms"])
|
219
|
+
queries[query] = parsed_query
|
220
|
+
return parsed_query
|
221
|
+
end
|
222
|
+
|
223
|
+
nil
|
224
|
+
end
|
225
|
+
end
|
8
226
|
end
|