mascot-dat 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/lib/mascot/dat/peptides.rb +10 -1
- data/lib/mascot/dat/psm.rb +3 -1
- data/lib/mascot/dat/query.rb +118 -0
- data/lib/mascot/dat/version.rb +1 -1
- data/lib/mascot/dat.rb +5 -39
- data/test/test_mascot-dat-query.rb +65 -0
- data/test/test_mascot-dat.rb +4 -6
- metadata +11 -8
data/lib/mascot/dat/peptides.rb
CHANGED
@@ -6,7 +6,7 @@ module Mascot
|
|
6
6
|
# access this section as one big chunk in memory. It is often quite large and
|
7
7
|
# needs to be accessed using Enumerable methods.
|
8
8
|
#
|
9
|
-
# From the Mascot documentation,
|
9
|
+
# From the Mascot documentation, the following represents a reasonably complete PSM
|
10
10
|
# q1_p1_db=01 # two digit integer of the search DB index, zero filled and retarded.
|
11
11
|
# q1_p1=missed cleavages, (–1 indicates no match)
|
12
12
|
# peptide Mr,
|
@@ -76,11 +76,17 @@ module Mascot
|
|
76
76
|
@file.pos = @byteoffset + @boundary_line.length
|
77
77
|
end
|
78
78
|
|
79
|
+
# Return a specific {Mascot::DAT::PSM} identified for query <code>q</code> and peptide number <code>p</code>
|
80
|
+
# @param q Fixnum
|
81
|
+
# @param p Fixnum
|
82
|
+
# @return Mascot::DAT::PSM
|
79
83
|
def psm q,p
|
80
84
|
@file.pos = @psmidx[q][p]
|
81
85
|
next_psm
|
82
86
|
end
|
83
87
|
|
88
|
+
# Returns the next {Mascot::DAT::PSM} from the DAT file. If there is no other PSM, then it returns nil.
|
89
|
+
# @return Mascot::DAT::PSM
|
84
90
|
def next_psm
|
85
91
|
return nil if @file.pos >= @endbytepos
|
86
92
|
# get the initial values for query & rank
|
@@ -106,7 +112,10 @@ module Mascot
|
|
106
112
|
Mascot::DAT::PSM.parse(tmp)
|
107
113
|
end
|
108
114
|
|
115
|
+
# Iterate through all of the {Mascot::DAT::PSM} entries in the DAT file.
|
116
|
+
# @return Enumerator
|
109
117
|
def each
|
118
|
+
@file.pos = @byteoffset
|
110
119
|
while @file.pos < @endbytepos
|
111
120
|
psm = next_psm()
|
112
121
|
next if psm.nil?
|
data/lib/mascot/dat/psm.rb
CHANGED
@@ -34,8 +34,9 @@ module Mascot
|
|
34
34
|
end
|
35
35
|
def self.parse psm_arr
|
36
36
|
psm_result = self.new()
|
37
|
-
|
38
37
|
psm_arr.each do |l|
|
38
|
+
next unless l =~ /^q/
|
39
|
+
|
39
40
|
k,v = l.split "="
|
40
41
|
case k
|
41
42
|
when /^q(\d+)_p(\d+)$/
|
@@ -69,6 +70,7 @@ module Mascot
|
|
69
70
|
psm_result.terms = v.split(":").collect {|t| t.split(",") }
|
70
71
|
else
|
71
72
|
# returns the smaller key
|
73
|
+
puts "****#{k}***"
|
72
74
|
k_sym = k.slice(/q\d+_p\d+_?(.+)/,1).to_sym
|
73
75
|
psm_result.attrs[k_sym] = v
|
74
76
|
end
|
@@ -0,0 +1,118 @@
|
|
1
|
+
module Mascot
|
2
|
+
class DAT
|
3
|
+
|
4
|
+
# A class to represent mass spectrum query objects in Mascot DAT files.
|
5
|
+
# Here is an example:
|
6
|
+
#
|
7
|
+
# --gc0p4Jq0M2Yt08jU534c0p
|
8
|
+
# Content-Type: application/x-Mascot; name="query3"
|
9
|
+
#
|
10
|
+
# title=253%2e131203405971_503
|
11
|
+
# rtinseconds=503
|
12
|
+
# index=5
|
13
|
+
# charge=2+
|
14
|
+
# mass_min=88.063115
|
15
|
+
# mass_max=392.171066
|
16
|
+
# int_min=6.064e+05
|
17
|
+
# int_max=6.064e+05
|
18
|
+
# num_vals=10
|
19
|
+
# num_used1=-1
|
20
|
+
# Ions1=88.063115:6.064e+05,196.589171:6.064e+05,331.143454:6.064e+05,392.171066:6.064e+05,114.570773:6.064e+05,228.134269:6.064e+05,139.567707:6.064e+05,278.128138:6.064e+05,166.075365:6.064e+05,175.118953:6.064e+05
|
21
|
+
#
|
22
|
+
# Things to note are:
|
23
|
+
#
|
24
|
+
# * the spectrum title is encoded to produce nice output in HTML
|
25
|
+
# * the m/z and intensity values are given as pairs of values
|
26
|
+
# * the m/z and intensity values are not in increasing values of m/z
|
27
|
+
#
|
28
|
+
# This parser accounts for these in the attributes like so:
|
29
|
+
#
|
30
|
+
# * spectrum title is de-encoded
|
31
|
+
# * the pairs of m/z and intensity are accessible via the {#peaks} method
|
32
|
+
# * the {#peaks} are ordered in accordance to increasing m/z
|
33
|
+
# * there are {#mz} and {#intensity} methods to get the individual array of values for each
|
34
|
+
#
|
35
|
+
class Query
|
36
|
+
# The name of the query in Mascot DAT file, e.g. the MIME section header
|
37
|
+
attr_reader :name
|
38
|
+
# The spectrum title from the source mass spectrum file
|
39
|
+
attr_reader :title
|
40
|
+
# No clue what this is
|
41
|
+
attr_reader :index
|
42
|
+
# Retention time in seconds
|
43
|
+
attr_reader :rtinseconds
|
44
|
+
# Charge state of the parent MS1 ion
|
45
|
+
attr_reader :charge
|
46
|
+
# The minimum m/z of the values
|
47
|
+
attr_reader :mass_min
|
48
|
+
# The maximum m/z of the values
|
49
|
+
attr_reader :mass_max
|
50
|
+
# The minimum intensity of the values
|
51
|
+
attr_reader :int_min
|
52
|
+
# The maximum intensity of the values
|
53
|
+
attr_reader :int_max
|
54
|
+
# The number of peaks
|
55
|
+
attr_reader :num_vals
|
56
|
+
# No clue what this is
|
57
|
+
attr_reader :num_used1
|
58
|
+
# An Array of [m/z, intensity] tuples, ordered by increasing m/z values
|
59
|
+
attr_reader :peaks
|
60
|
+
# An Array of m/z values, ordered by increasing m/z
|
61
|
+
attr_reader :mz
|
62
|
+
# An Array of intensity values, ordered by the corresponding m/z value in the {#mz} Array
|
63
|
+
attr_reader :intensity
|
64
|
+
|
65
|
+
# All other attributes from DAT query sections not covered above
|
66
|
+
attr_reader :attributes
|
67
|
+
|
68
|
+
def initialize(query_str)
|
69
|
+
query_str.split(/\n/).each do |l|
|
70
|
+
next unless l =~ /(\w+)\=(.+)$/
|
71
|
+
k,v = $1,$2
|
72
|
+
case k
|
73
|
+
when "name"
|
74
|
+
@name = v.gsub('"','')
|
75
|
+
when "title"
|
76
|
+
@title = URI.decode(v)
|
77
|
+
when "index"
|
78
|
+
@index = v.to_i
|
79
|
+
when "rtinseconds"
|
80
|
+
@rtinseconds = v.to_i
|
81
|
+
when "charge"
|
82
|
+
@charge = v
|
83
|
+
when "mass_min"
|
84
|
+
@mass_min = v.to_f
|
85
|
+
when "mass_max"
|
86
|
+
@mass_max = v.to_f
|
87
|
+
when "int_min"
|
88
|
+
@int_min = v.to_f
|
89
|
+
when "int_max"
|
90
|
+
@int_max = v.to_f
|
91
|
+
when "num_vals"
|
92
|
+
@num_vals = v.to_i
|
93
|
+
when "num_used1"
|
94
|
+
@num_used1 = v.to_i
|
95
|
+
when "Ions1"
|
96
|
+
parse_ions1(v)
|
97
|
+
else
|
98
|
+
@attributes[k.to_sym] = v
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
private
|
104
|
+
def parse_ions1(ions1)
|
105
|
+
@peaks = []
|
106
|
+
ions1.split(",").collect do |mzpair|
|
107
|
+
@peaks << mzpair.split(":").collect {|e| e.to_f}
|
108
|
+
end
|
109
|
+
# now sort the mz_tmp array as ascending m/z, and return the array
|
110
|
+
@peaks.sort!
|
111
|
+
# once sorted by increasing m/z, populate the individual arrays
|
112
|
+
@mz = @peaks.collect {|p| p[0]}
|
113
|
+
@intensity = @peaks.collect {|p| p[1]}
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
data/lib/mascot/dat/version.rb
CHANGED
data/lib/mascot/dat.rb
CHANGED
@@ -6,6 +6,7 @@ require 'mascot/dat/parameters'
|
|
6
6
|
require 'mascot/dat/peptides'
|
7
7
|
require 'mascot/dat/proteins'
|
8
8
|
require 'mascot/dat/psm'
|
9
|
+
require "mascot/dat/query"
|
9
10
|
require 'mascot/dat/search_databases'
|
10
11
|
require 'mascot/dat/summary'
|
11
12
|
require 'mascot/dat/version'
|
@@ -48,41 +49,15 @@ module Mascot
|
|
48
49
|
@dat_file.close
|
49
50
|
end
|
50
51
|
|
51
|
-
#
|
52
|
+
# Return a specific query spectrum from the DAT file
|
52
53
|
#
|
53
54
|
# @param n The query spectrum numerical index
|
54
|
-
# @return
|
55
|
+
# @return {Mascot::DAT::Query}
|
55
56
|
def query(n)
|
56
|
-
#
|
57
|
-
bytepos = @idx["query#{n}".to_sym]
|
58
|
-
@dat_file.pos = bytepos + @boundary_string.length
|
59
|
-
att_rx = /(\w+)\=(.+)/
|
60
|
-
q = {}
|
61
|
-
@dat_file.each do |l|
|
62
|
-
l.chomp
|
63
|
-
case l
|
64
|
-
when att_rx
|
65
|
-
k,v = $1,$2
|
66
|
-
case k
|
67
|
-
when "title"
|
68
|
-
q[k.to_sym] = URI.decode(v)
|
69
|
-
when "Ions1"
|
70
|
-
q[:peaks] = parse_mzi(v)
|
71
|
-
else
|
72
|
-
q[k.to_sym] = v
|
73
|
-
end
|
74
|
-
when @boundary
|
75
|
-
break
|
76
|
-
else
|
77
|
-
next
|
78
|
-
end
|
79
|
-
end
|
80
|
-
q
|
57
|
+
return Mascot::DAT::Query.new(self.read_section(:"query#{n}"))
|
81
58
|
end
|
82
|
-
|
83
59
|
alias_method :spectrum, :query
|
84
60
|
|
85
|
-
|
86
61
|
# Go to a section of the Mascot DAT file
|
87
62
|
def goto(key)
|
88
63
|
if @idx.has_key?(key.to_sym)
|
@@ -189,15 +164,6 @@ module Mascot
|
|
189
164
|
@dat_file.rewind
|
190
165
|
end
|
191
166
|
|
192
|
-
|
193
|
-
# Peaks are not ordered, so we must account for that.
|
194
|
-
def parse_mzi(ions_str)
|
195
|
-
mzi_tmp = []
|
196
|
-
ions_str.split(",").collect do |mzpair|
|
197
|
-
mzi_tmp << mzpair.split(":").collect {|e| e.to_f}
|
198
|
-
end
|
199
|
-
# now sort the mz_tmp array as ascending m/z, and return the array
|
200
|
-
mzi_tmp.sort
|
201
|
-
end
|
167
|
+
|
202
168
|
end
|
203
169
|
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'test_mascot-dat-helper'
|
2
|
+
|
3
|
+
class TestMascotDatQuery < TestMascotDatHelper
|
4
|
+
def setup
|
5
|
+
super
|
6
|
+
@query = @dat.query(23)
|
7
|
+
end
|
8
|
+
def test_name
|
9
|
+
assert_equal("query23", @query.name)
|
10
|
+
end
|
11
|
+
def test_title
|
12
|
+
assert_equal("281.832701459371_513",@query.title)
|
13
|
+
end
|
14
|
+
def test_rtinseconds
|
15
|
+
assert_equal(513, @query.rtinseconds)
|
16
|
+
end
|
17
|
+
def test_index
|
18
|
+
assert_equal(30,@query.index)
|
19
|
+
end
|
20
|
+
def test_charge
|
21
|
+
assert_equal("3+",@query.charge)
|
22
|
+
end
|
23
|
+
def test_mass_min
|
24
|
+
assert_equal(59.044502, @query.mass_min)
|
25
|
+
end
|
26
|
+
def test_mass_max
|
27
|
+
assert_equal(730.399487,@query.mass_max)
|
28
|
+
end
|
29
|
+
def test_int_min
|
30
|
+
assert_equal(1.951e+05, @query.int_min)
|
31
|
+
end
|
32
|
+
def test_int_max
|
33
|
+
assert_equal(1.951e+05, @query.int_max)
|
34
|
+
end
|
35
|
+
def test_num_vals
|
36
|
+
assert_equal(33,@query.num_vals)
|
37
|
+
end
|
38
|
+
def test_num_used1
|
39
|
+
assert_equal(-1, @query.num_used1)
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_peaks
|
43
|
+
expected_peaks = Marshal.load(File.read("test/fixtures/query23_peaks.dmp"))
|
44
|
+
assert_equal(expected_peaks,@query.peaks)
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_mz_array
|
48
|
+
mz_expected = [59.044502, 76.396653, 88.063115, 92.727062, 111.734216,
|
49
|
+
114.091341, 122.082957, 138.586954, 160.757021, 167.097686, 171.105762,
|
50
|
+
175.118953, 182.620797, 190.112916, 206.443325, 223.795476, 227.175405,
|
51
|
+
240.631893, 244.138013, 256.155004, 276.166632, 284.665736, 309.16135,
|
52
|
+
333.188096, 335.189576, 364.234317, 365.703382, 480.256511, 511.302732,
|
53
|
+
568.324196, 617.315423, 669.371875, 730.399487]
|
54
|
+
assert_equal(mz_expected,@query.mz)
|
55
|
+
end
|
56
|
+
|
57
|
+
def test_intensity_array
|
58
|
+
intensity_expected = [195100.0, 195100.0, 195100.0, 195100.0, 195100.0,
|
59
|
+
195100.0, 195100.0, 195100.0, 195100.0, 195100.0, 195100.0, 195100.0,
|
60
|
+
195100.0, 195100.0, 195100.0, 195100.0, 195100.0, 195100.0, 195100.0,
|
61
|
+
195100.0, 195100.0, 195100.0, 195100.0, 195100.0, 195100.0, 195100.0,
|
62
|
+
195100.0, 195100.0, 195100.0, 195100.0, 195100.0, 195100.0, 195100.0]
|
63
|
+
assert_equal(intensity_expected,@query.intensity)
|
64
|
+
end
|
65
|
+
end
|
data/test/test_mascot-dat.rb
CHANGED
@@ -9,6 +9,10 @@ class TestMascotDat < TestMascotDatHelper
|
|
9
9
|
assert_equal(Regexp.new("--gc0p4Jq0M2Yt08jU534c0p"), @dat.boundary)
|
10
10
|
end
|
11
11
|
|
12
|
+
def test_dat_boundary_string
|
13
|
+
assert_equal("--gc0p4Jq0M2Yt08jU534c0p", @dat.boundary_string)
|
14
|
+
end
|
15
|
+
|
12
16
|
def test_dat_byteoffset_index_is_created
|
13
17
|
File.unlink(@dat.dat_file.path + ".idx") if File.exists?(@dat.dat_file.path + ".idx")
|
14
18
|
@dat = Mascot::DAT.open("test/fixtures/example.dat")
|
@@ -42,10 +46,4 @@ class TestMascotDat < TestMascotDatHelper
|
|
42
46
|
assert_equal(expected_section, @dat.read_section(:masses))
|
43
47
|
end
|
44
48
|
|
45
|
-
def test_peaks
|
46
|
-
expected_peaks = Marshal.load(File.read("test/fixtures/query23_peaks.dmp"))
|
47
|
-
query23 = @dat.query(23)
|
48
|
-
assert_equal(expected_peaks,query23[:peaks])
|
49
|
-
end
|
50
|
-
|
51
49
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mascot-dat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &70340237935320 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70340237935320
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: yard
|
27
|
-
requirement: &
|
27
|
+
requirement: &70340237934880 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70340237934880
|
36
36
|
description: Mascot DAT file format parser
|
37
37
|
email:
|
38
38
|
- angel@upenn.edu
|
@@ -53,6 +53,7 @@ files:
|
|
53
53
|
- lib/mascot/dat/peptides.rb
|
54
54
|
- lib/mascot/dat/proteins.rb
|
55
55
|
- lib/mascot/dat/psm.rb
|
56
|
+
- lib/mascot/dat/query.rb
|
56
57
|
- lib/mascot/dat/search_databases.rb
|
57
58
|
- lib/mascot/dat/summary.rb
|
58
59
|
- lib/mascot/dat/version.rb
|
@@ -69,6 +70,7 @@ files:
|
|
69
70
|
- test/test_mascot-dat-parameters.rb
|
70
71
|
- test/test_mascot-dat-peptides.rb
|
71
72
|
- test/test_mascot-dat-proteins.rb
|
73
|
+
- test/test_mascot-dat-query.rb
|
72
74
|
- test/test_mascot-dat-search_databases.rb
|
73
75
|
- test/test_mascot-dat-summary.rb
|
74
76
|
- test/test_mascot-dat.rb
|
@@ -86,7 +88,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
86
88
|
version: '0'
|
87
89
|
segments:
|
88
90
|
- 0
|
89
|
-
hash: -
|
91
|
+
hash: -3741896018441832167
|
90
92
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
93
|
none: false
|
92
94
|
requirements:
|
@@ -95,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
95
97
|
version: '0'
|
96
98
|
segments:
|
97
99
|
- 0
|
98
|
-
hash: -
|
100
|
+
hash: -3741896018441832167
|
99
101
|
requirements: []
|
100
102
|
rubyforge_project:
|
101
103
|
rubygems_version: 1.8.11
|
@@ -115,6 +117,7 @@ test_files:
|
|
115
117
|
- test/test_mascot-dat-parameters.rb
|
116
118
|
- test/test_mascot-dat-peptides.rb
|
117
119
|
- test/test_mascot-dat-proteins.rb
|
120
|
+
- test/test_mascot-dat-query.rb
|
118
121
|
- test/test_mascot-dat-search_databases.rb
|
119
122
|
- test/test_mascot-dat-summary.rb
|
120
123
|
- test/test_mascot-dat.rb
|