mascot-dat 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/mascot/dat.rb +18 -4
- data/lib/mascot/dat/peptides.rb +86 -95
- data/lib/mascot/dat/psm.rb +71 -55
- data/lib/mascot/dat/query.rb +1 -1
- data/lib/mascot/dat/version.rb +1 -1
- data/test/test_mascot-dat-peptides.rb +21 -0
- data/test/test_mascot-dat-query.rb +1 -1
- metadata +7 -7
data/lib/mascot/dat.rb
CHANGED
@@ -87,7 +87,7 @@ module Mascot
|
|
87
87
|
|
88
88
|
# Parse the enzyme information from the DAT file
|
89
89
|
#
|
90
|
-
# @return [
|
90
|
+
# @return [Mascot::DAT::Enzyme]
|
91
91
|
def enzyme
|
92
92
|
@enzyme ||= Mascot::DAT::Enzyme.new(self.read_section(:enzyme))
|
93
93
|
end
|
@@ -110,13 +110,27 @@ module Mascot
|
|
110
110
|
end
|
111
111
|
|
112
112
|
# Puts the IO cursor at the beginning of the peptides result section and returns an iterator/parser for the PSM results.
|
113
|
-
#
|
113
|
+
#
|
114
|
+
# @param cache_psm_index Whether to cache the positions of the PSMs. If you just want to iterate
|
115
|
+
# through PSMs, you do not need to cache the index.
|
116
|
+
# @return [Mascot::DAT::Peptides]
|
114
117
|
def peptides(cache_psm_index=true)
|
115
|
-
Mascot::DAT::Peptides.new(self
|
118
|
+
Mascot::DAT::Peptides.new(self, :peptides, cache_psm_index)
|
116
119
|
end
|
117
120
|
|
121
|
+
# If the DAT file has a decoy section, puts the IO cursor at the beginning of the decoy_peptides
|
122
|
+
# result section and returns an iterator/parser for the decoy PSM results.
|
123
|
+
# If no decoy section exists, it will return nil.
|
124
|
+
#
|
125
|
+
# @param cache_psm_index Whether to cache the positions of the PSMs. If you just want to iterate
|
126
|
+
# through PSMs, you do not need to cache the index.
|
127
|
+
# @return [Mascot::DAT::Peptides, NilClass]
|
118
128
|
def decoy_peptides(cache_psm_index=true)
|
119
|
-
|
129
|
+
if @idx.has_key? :decoy_peptides
|
130
|
+
Mascot::DAT::Peptides.new(self,:decoy_peptides, cache_psm_index)
|
131
|
+
else
|
132
|
+
nil
|
133
|
+
end
|
120
134
|
end
|
121
135
|
|
122
136
|
|
data/lib/mascot/dat/peptides.rb
CHANGED
@@ -1,127 +1,118 @@
|
|
1
1
|
require 'csv'
|
2
2
|
module Mascot
|
3
3
|
class DAT
|
4
|
-
# A
|
4
|
+
# An iterator for the peptide spectrum match results of a Mascot DAT file.
|
5
5
|
# As opposed to the other sections of a DAT file, you don't really want to
|
6
|
-
# access this section
|
7
|
-
# needs to be accessed using Enumerable methods.
|
8
|
-
#
|
9
|
-
# From the Mascot documentation, the following represents a reasonably complete PSM
|
10
|
-
# q1_p1_db=01 # two digit integer of the search DB index, zero filled and retarded.
|
11
|
-
# q1_p1=missed cleavages, (–1 indicates no match)
|
12
|
-
# peptide Mr,
|
13
|
-
# delta,
|
14
|
-
# number of ions matched,
|
15
|
-
# peptide string,
|
16
|
-
# peaks used from Ions1,
|
17
|
-
# variable modifications string,
|
18
|
-
# ions score,
|
19
|
-
# ion series found,
|
20
|
-
# peaks used from Ions2,
|
21
|
-
# peaks used from Ions3;
|
22
|
-
# “accession string”:frame number:start:end:multiplicity, # data for first protein
|
23
|
-
# “accession string”:frame number:start:end:multiplicity, # data for second protein, etc.
|
24
|
-
# q1_p1_et_mods=modification mass,
|
25
|
-
# neutral loss mass,
|
26
|
-
# modification description
|
27
|
-
# q1_p1_primary_nl=neutral loss string
|
28
|
-
# q1_p1_drange=startPos:endPos
|
29
|
-
# q1_p1_terms=residue,residue:residue,residue # flanking AA for each protien, in order
|
30
|
-
#
|
6
|
+
# load this whole section into memory at once. It is often quite large and
|
7
|
+
# needs to be accessed using the provided Enumerable or random access methods.
|
31
8
|
class Peptides
|
32
9
|
include Enumerable
|
33
|
-
# A
|
34
|
-
#
|
35
|
-
|
10
|
+
# A nested Hash index of the byte offset positions for the peptide-spectrum-match entries.
|
11
|
+
# The keys of the index are the query and peptide rank (Fixnum), the structure of which is:
|
12
|
+
# { query_number => { peptide_rank => byte_position } }
|
13
|
+
# To access a particular entry, it is better to use the {#psm} method.
|
14
|
+
# @return [Hash{ Fixnum => Hash{ Fixnum => Fixnum }}] The nested hash of query peptide match byte offsets
|
15
|
+
attr_reader :psmidx
|
36
16
|
|
37
|
-
#
|
38
|
-
#
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
@
|
44
|
-
|
45
|
-
|
17
|
+
# @param dat [Mascot::DAT] Source DAT file
|
18
|
+
# @param section_label [Symbol] Section header, one of :peptides or :decoy_peptides
|
19
|
+
# @param cache_psm_index [Boolean] Whether to cache the PSM index
|
20
|
+
def initialize(dat, section_label, cache_psm_index=true)
|
21
|
+
# create our own filehandle, since other operations may interfere with the cursor position of a shared one
|
22
|
+
@dat = Mascot::DAT.open(dat.dat_file.path)
|
23
|
+
@filehandle = @dat.dat_file
|
24
|
+
@section_label = section_label
|
25
|
+
self.rewind
|
46
26
|
@curr_psm = [1,1]
|
47
|
-
@psmidx =
|
48
|
-
@
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
def index_psm_positions
|
53
|
-
# create an in-memroy index of PSM byteoffsets
|
54
|
-
q,p = 0
|
55
|
-
@boundary_line = @file.readline
|
56
|
-
@boundary = Regexp.new(@boundary_line)
|
57
|
-
@file.each do |line|
|
58
|
-
break if line =~ @boundary
|
59
|
-
if @cache_psm_index
|
60
|
-
line =~ /q(\d+)_p(\d+)/
|
61
|
-
i,j = $1.to_i, $2.to_i
|
62
|
-
next if q == i && p == j
|
63
|
-
unless @psmidx[i].kind_of? Array
|
64
|
-
q = i
|
65
|
-
@psmidx[q] = []
|
66
|
-
end
|
67
|
-
@psmidx[i][j] = @file.pos - line.length
|
68
|
-
q,p = i,j
|
69
|
-
end
|
27
|
+
@psmidx = {}
|
28
|
+
@endbytepos = Float::INFINITY
|
29
|
+
if cache_psm_index
|
30
|
+
index_psm_positions()
|
70
31
|
end
|
71
|
-
@endbytepos = @file.pos - @boundary_line.length
|
72
|
-
rewind
|
73
32
|
end
|
74
33
|
|
34
|
+
# Rewind the cursor to the start of the peptides section (e.g. q1_p1=...)
|
75
35
|
def rewind
|
76
|
-
@
|
36
|
+
@dat.goto(@section_label)
|
37
|
+
1.upto(2) { @filehandle.readline }
|
77
38
|
end
|
78
39
|
|
79
40
|
# Return a specific {Mascot::DAT::PSM} identified for query <code>query_number</code> and peptide rank <code>rank</code>
|
80
|
-
# @param
|
81
|
-
# @param
|
82
|
-
# @return Mascot::DAT::PSM
|
83
|
-
|
84
|
-
|
85
|
-
|
41
|
+
# @param query_number [Fixnum]
|
42
|
+
# @param rank [Fixnum]
|
43
|
+
# @return [Mascot::DAT::PSM]
|
44
|
+
# @raise [Exception] if given an invalid q,p coordinate
|
45
|
+
# @example my_dat.peptides.psm(1,1) # => Mascot::DAT::PSM for query 1 peptide 1
|
46
|
+
def psm query_number,rank
|
47
|
+
if @psmidx[query_number] and @psmidx[query_number][rank]
|
48
|
+
@filehandle.pos = @psmidx[query_number][rank]
|
49
|
+
next_psm
|
50
|
+
else
|
51
|
+
raise Exception.new "Invalid PSM specification (#{q},#{p})"
|
52
|
+
end
|
86
53
|
end
|
87
54
|
|
88
55
|
# Returns the next {Mascot::DAT::PSM} from the DAT file. If there is no other PSM, then it returns nil.
|
89
|
-
# @return Mascot::DAT::PSM
|
56
|
+
# @return [Mascot::DAT::PSM, NilClass]
|
90
57
|
def next_psm
|
91
|
-
|
58
|
+
if @filehandle.pos >= @endbytepos
|
59
|
+
return nil
|
60
|
+
end
|
92
61
|
# get the initial values for query & rank
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
break if
|
62
|
+
buffer = [@filehandle.readline.chomp]
|
63
|
+
buffer[0] =~ /q(\d+)_p(\d+)/
|
64
|
+
q,p = $1, $2
|
65
|
+
@curr_psm = [q,p]
|
66
|
+
prev_pos = @filehandle.pos
|
67
|
+
@filehandle.each do |l|
|
68
|
+
l.chomp!
|
69
|
+
# break if we have reached the boundary
|
70
|
+
if l =~ @boundary
|
71
|
+
@endbytepos = @filehandle.pos - @dat.boundary_string.length
|
72
|
+
break
|
73
|
+
end
|
74
|
+
# break if we are on another PSM
|
106
75
|
break unless l =~ /^q#{q}_p#{p}/
|
107
|
-
|
108
|
-
|
76
|
+
buffer << l
|
77
|
+
prev_pos = @filehandle.pos
|
109
78
|
end
|
110
|
-
|
111
|
-
|
112
|
-
|
79
|
+
# rewind the cursor to the last hit
|
80
|
+
@filehandle.pos = prev_pos
|
81
|
+
# return the new PSM
|
82
|
+
Mascot::DAT::PSM.new(buffer)
|
113
83
|
end
|
114
84
|
|
115
85
|
# Iterate through all of the {Mascot::DAT::PSM} entries in the DAT file.
|
116
|
-
# @
|
86
|
+
# @yield [Mascot::DAT::PSM]
|
117
87
|
def each
|
118
|
-
|
119
|
-
while
|
120
|
-
psm = next_psm()
|
121
|
-
next if psm.nil?
|
88
|
+
self.rewind
|
89
|
+
while psm = self.next_psm
|
122
90
|
yield psm
|
123
91
|
end
|
124
92
|
end
|
93
|
+
|
94
|
+
private
|
95
|
+
# Index the byte offsets of the PSMs
|
96
|
+
# @private
|
97
|
+
def index_psm_positions
|
98
|
+
# create an in-memory index of PSM byte offsets
|
99
|
+
q,p = 0,0
|
100
|
+
# move the cursor past the boundary line
|
101
|
+
@filehandle.readline
|
102
|
+
@filehandle.each do |line|
|
103
|
+
break if line =~ @dat.boundary
|
104
|
+
line =~ /q(\d+)_p(\d+)/
|
105
|
+
qq,pp= $1.to_i, $2.to_i
|
106
|
+
next if q == qq && p == pp
|
107
|
+
q,p = qq,pp
|
108
|
+
unless @psmidx.has_key?(q)
|
109
|
+
@psmidx[q] = {}
|
110
|
+
end
|
111
|
+
@psmidx[q][p] = @filehandle.pos - line.length
|
112
|
+
end
|
113
|
+
@endbytepos = @filehandle.pos - @dat.boundary_string.length
|
114
|
+
self.rewind
|
115
|
+
end
|
125
116
|
end
|
126
117
|
end
|
127
118
|
end
|
data/lib/mascot/dat/psm.rb
CHANGED
@@ -1,81 +1,97 @@
|
|
1
1
|
module Mascot
|
2
2
|
class DAT
|
3
|
+
# A single Peptide Spectrum Match (PSM) result. In Mascot parlance, this is a
|
4
|
+
# match from a query (e.g. a single MS2 spectrum from a MGF file) to a given peptide. A query may match more than one
|
5
|
+
# peptide at a given score, and Mascot will report these in order of descending significance, or "rank".
|
6
|
+
#
|
7
|
+
# From the Mascot documentation, the following represents a reasonably complete PSM entry
|
8
|
+
# q1_p1_db=01 # two-digit, zero-padded integer index of the search DB
|
9
|
+
# q1_p1=missed cleavages, (-1 indicates no match)
|
10
|
+
# peptide Mr,
|
11
|
+
# delta,
|
12
|
+
# number of ions matched,
|
13
|
+
# peptide string,
|
14
|
+
# peaks used from Ions1,
|
15
|
+
# variable modifications string,
|
16
|
+
# ions score,
|
17
|
+
# ion series found,
|
18
|
+
# peaks used from Ions2,
|
19
|
+
# peaks used from Ions3;
|
20
|
+
# "accession string":frame number:start:end:multiplicity, # data for first protein
|
21
|
+
# "accession string":frame number:start:end:multiplicity, # data for second protein, etc.
|
22
|
+
# q1_p1_et_mods=modification mass,
|
23
|
+
# neutral loss mass,
|
24
|
+
# modification description
|
25
|
+
# q1_p1_primary_nl=neutral loss string
|
26
|
+
# q1_p1_drange=startPos:endPos
|
27
|
+
# q1_p1_terms=residue,residue:residue,residue # flanking AA for each protein, in order
|
28
|
+
#
|
3
29
|
class PSM
|
4
30
|
|
5
|
-
|
6
|
-
|
7
|
-
|
31
|
+
attr_accessor :query
|
32
|
+
attr_accessor :rank
|
33
|
+
attr_accessor :missed_cleavages
|
34
|
+
attr_accessor :mr
|
35
|
+
attr_accessor :delta
|
36
|
+
attr_accessor :num_ions_matched
|
37
|
+
attr_accessor :pep
|
38
|
+
attr_accessor :ions1
|
39
|
+
attr_accessor :var_mods_str
|
40
|
+
attr_accessor :score
|
41
|
+
attr_accessor :ion_series_str
|
42
|
+
attr_accessor :ions2
|
43
|
+
attr_accessor :ions3
|
44
|
+
attr_accessor :proteins
|
45
|
+
attr_accessor :dbs
|
46
|
+
attr_accessor :terms
|
47
|
+
attr_accessor :attrs
|
8
48
|
|
9
|
-
|
10
|
-
|
49
|
+
# @param psm_entry [Array<String>] The lines of a single PSM entry from the Mascot DAT file
|
50
|
+
# @return [Mascot::DAT::PSM]
|
51
|
+
def initialize(psm_entry)
|
52
|
+
parse_entry(psm_entry)
|
11
53
|
end
|
12
54
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def ==(other)
|
26
|
-
is_eql = true
|
27
|
-
ATTRS.each do |a|
|
28
|
-
if self.send(a) != other.send(a)
|
29
|
-
is_eql = false
|
30
|
-
break
|
31
|
-
end
|
32
|
-
end
|
33
|
-
is_eql
|
34
|
-
end
|
35
|
-
def self.parse psm_arr
|
36
|
-
psm_result = self.new()
|
37
|
-
psm_arr.each do |l|
|
38
|
-
next unless l =~ /^q/
|
39
|
-
|
55
|
+
private
|
56
|
+
# Parses the query entry multi-line string from the Mascot DAT file
|
57
|
+
# @private
|
58
|
+
# @param psm_entry [Array]
|
59
|
+
# @return [Mascot::DAT::PSM]
|
60
|
+
def parse_entry psm_entry
|
61
|
+
psm_entry.each do |l|
|
40
62
|
k,v = l.split "="
|
41
63
|
case k
|
42
64
|
when /^q(\d+)_p(\d+)$/
|
43
|
-
|
44
|
-
|
65
|
+
@query = $1.to_i
|
66
|
+
@rank = $2.to_i
|
45
67
|
psm_vals, prots = v.split(";")
|
46
68
|
psm_vals = psm_vals.split(',')
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
69
|
+
@missed_cleavages= psm_vals[0].to_i
|
70
|
+
@mr = psm_vals[1].to_f
|
71
|
+
@delta = psm_vals[2].to_f
|
72
|
+
@num_ions_matched = psm_vals[3].to_i
|
73
|
+
@pep = psm_vals[4]
|
74
|
+
@ions1 = psm_vals[5].to_i
|
75
|
+
@var_mods_str = psm_vals[6]
|
76
|
+
@score = psm_vals[7].to_f
|
77
|
+
@ion_series_str = psm_vals[8]
|
78
|
+
@ions2 = psm_vals[9].to_i
|
79
|
+
@ions3 = psm_vals[10].to_i
|
58
80
|
|
59
|
-
# assign
|
60
|
-
|
81
|
+
# assign the matched proteins
|
82
|
+
@proteins = prots.split(",").map do |pe|
|
61
83
|
acc,*other_vals = pe.split(":")
|
62
84
|
acc.gsub!(/\"/,'')
|
63
85
|
[acc] + other_vals.map {|e| e.to_i }
|
64
86
|
end
|
65
87
|
when /db$/
|
66
88
|
# split on 2 chars, call to_i
|
67
|
-
|
89
|
+
@dbs = v.split(/(\d{2})/).grep(/^\d+$/).collect { |e| e.to_i }
|
68
90
|
when /terms$/
|
69
91
|
# for each protein, I have to add the term AA
|
70
|
-
|
71
|
-
else
|
72
|
-
# returns the smaller key
|
73
|
-
puts "****#{k}***"
|
74
|
-
k_sym = k.slice(/q\d+_p\d+_?(.+)/,1).to_sym
|
75
|
-
psm_result.attrs[k_sym] = v
|
92
|
+
@terms = v.split(":").collect {|t| t.split(",") }
|
76
93
|
end
|
77
94
|
end
|
78
|
-
psm_result
|
79
95
|
end
|
80
96
|
end
|
81
97
|
end
|
data/lib/mascot/dat/query.rb
CHANGED
data/lib/mascot/dat/version.rb
CHANGED
@@ -52,4 +52,25 @@ class TestMascotDatPeptides < TestMascotDatHelper
|
|
52
52
|
assert_equal([2], q1p1_psm.dbs)
|
53
53
|
assert_equal([["R","Y"]], q1p1_psm.terms)
|
54
54
|
end
|
55
|
+
|
56
|
+
def test_second_psm_from_next_psm
|
57
|
+
q1p1_psm = @peptides.psm(1,1)
|
58
|
+
q1p2_psm = @peptides.next_psm()
|
59
|
+
assert_equal(1, q1p2_psm.query)
|
60
|
+
assert_equal(2, q1p2_psm.rank)
|
61
|
+
assert_equal(0, q1p2_psm.missed_cleavages)
|
62
|
+
assert_equal(476.223068, q1p2_psm.mr)
|
63
|
+
assert_equal(-0.940226, q1p2_psm.delta)
|
64
|
+
assert_equal(4, q1p2_psm.num_ions_matched)
|
65
|
+
assert_equal("GGESK", q1p2_psm.pep)
|
66
|
+
assert_equal(9, q1p2_psm.ions1)
|
67
|
+
assert_equal("0000000", q1p2_psm.var_mods_str)
|
68
|
+
assert_equal(13.29, q1p2_psm.score)
|
69
|
+
assert_equal("0000002020000000000", q1p2_psm.ion_series_str)
|
70
|
+
assert_equal(0, q1p2_psm.ions2)
|
71
|
+
assert_equal(0, q1p2_psm.ions3)
|
72
|
+
assert_equal([["P70298", 0, 605, 609, 1]], q1p2_psm.proteins)
|
73
|
+
assert_equal([2], q1p2_psm.dbs)
|
74
|
+
assert_equal([["K","N"]], q1p2_psm.terms)
|
75
|
+
end
|
55
76
|
end
|
@@ -12,7 +12,7 @@ class TestMascotDatQuery < TestMascotDatHelper
|
|
12
12
|
assert_equal("281.832701459371_513",@query.title)
|
13
13
|
end
|
14
14
|
def test_rtinseconds
|
15
|
-
assert_equal(513
|
15
|
+
assert_equal(513, @query.rtinseconds)
|
16
16
|
end
|
17
17
|
def test_index
|
18
18
|
assert_equal(30,@query.index)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mascot-dat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-07-10 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &70273728840180 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70273728840180
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: yard
|
27
|
-
requirement: &
|
27
|
+
requirement: &70273728839140 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70273728839140
|
36
36
|
description: Mascot DAT file format parser
|
37
37
|
email:
|
38
38
|
- angel@upenn.edu
|
@@ -88,7 +88,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
88
88
|
version: '0'
|
89
89
|
segments:
|
90
90
|
- 0
|
91
|
-
hash:
|
91
|
+
hash: 3863437592712259051
|
92
92
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
93
|
none: false
|
94
94
|
requirements:
|
@@ -97,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
97
|
version: '0'
|
98
98
|
segments:
|
99
99
|
- 0
|
100
|
-
hash:
|
100
|
+
hash: 3863437592712259051
|
101
101
|
requirements: []
|
102
102
|
rubyforge_project:
|
103
103
|
rubygems_version: 1.8.11
|