mascot-dat 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/mascot/dat.rb +18 -4
- data/lib/mascot/dat/peptides.rb +86 -95
- data/lib/mascot/dat/psm.rb +71 -55
- data/lib/mascot/dat/query.rb +1 -1
- data/lib/mascot/dat/version.rb +1 -1
- data/test/test_mascot-dat-peptides.rb +21 -0
- data/test/test_mascot-dat-query.rb +1 -1
- metadata +7 -7
data/lib/mascot/dat.rb
CHANGED
@@ -87,7 +87,7 @@ module Mascot
|
|
87
87
|
|
88
88
|
# Parse the enzyme information from the DAT file
|
89
89
|
#
|
90
|
-
# @return [
|
90
|
+
# @return [Mascot::DAT::Enzyme]
|
91
91
|
def enzyme
|
92
92
|
@enzyme ||= Mascot::DAT::Enzyme.new(self.read_section(:enzyme))
|
93
93
|
end
|
@@ -110,13 +110,27 @@ module Mascot
|
|
110
110
|
end
|
111
111
|
|
112
112
|
# Puts the IO cursor at the beginning of peptide result section. Returns an iterator/parser for PSM results
|
113
|
-
#
|
113
|
+
#
|
114
|
+
# @param cache_psm_index Whether to cache the positions of the PSMs. If you just want to iterate
|
115
|
+
# through PSMs, you do not need to cache the index.
|
116
|
+
# @return [Mascot::DAT::Peptides]
|
114
117
|
def peptides(cache_psm_index=true)
|
115
|
-
Mascot::DAT::Peptides.new(self
|
118
|
+
Mascot::DAT::Peptides.new(self, :peptides, cache_psm_index)
|
116
119
|
end
|
117
120
|
|
121
|
+
# If the DAT file has a decoy section, puts the IO cursor at the beginning of decoy_peptide
|
122
|
+
# result section and returns an iterator/parser for the decoy PSM results.
|
123
|
+
# If no decoy section exists, it will return nil.
|
124
|
+
#
|
125
|
+
# @param cache_psm_index Whether to cache the positions of the PSMs. If you just want to iterate
|
126
|
+
# through PSMs, you do not need to cache the index.
|
127
|
+
# @return [Mascot::DAT::Peptides, NilClass]
|
118
128
|
def decoy_peptides(cache_psm_index=true)
|
119
|
-
|
129
|
+
if @idx.has_key? :decoy_peptides
|
130
|
+
Mascot::DAT::Peptides.new(self,:decoy_peptides, cache_psm_index)
|
131
|
+
else
|
132
|
+
nil
|
133
|
+
end
|
120
134
|
end
|
121
135
|
|
122
136
|
|
data/lib/mascot/dat/peptides.rb
CHANGED
@@ -1,127 +1,118 @@
|
|
1
1
|
require 'csv'
|
2
2
|
module Mascot
|
3
3
|
class DAT
|
4
|
-
# A
|
4
|
+
# An iterator for the peptide spectrum match results of a Mascot DAT file.
|
5
5
|
# As opposed to the other sections of a DAT file, you don't really want to
|
6
|
-
# access this section
|
7
|
-
# needs to be accessed using Enumerable methods.
|
8
|
-
#
|
9
|
-
# From the Mascot documentation, the following represents a reasonably complete PSM
|
10
|
-
# q1_p1_db=01 # two digit integer of the search DB index, zero filled and retarded.
|
11
|
-
# q1_p1=missed cleavages, (–1 indicates no match)
|
12
|
-
# peptide Mr,
|
13
|
-
# delta,
|
14
|
-
# number of ions matched,
|
15
|
-
# peptide string,
|
16
|
-
# peaks used from Ions1,
|
17
|
-
# variable modifications string,
|
18
|
-
# ions score,
|
19
|
-
# ion series found,
|
20
|
-
# peaks used from Ions2,
|
21
|
-
# peaks used from Ions3;
|
22
|
-
# “accession string”:frame number:start:end:multiplicity, # data for first protein
|
23
|
-
# “accession string”:frame number:start:end:multiplicity, # data for second protein, etc.
|
24
|
-
# q1_p1_et_mods=modification mass,
|
25
|
-
# neutral loss mass,
|
26
|
-
# modification description
|
27
|
-
# q1_p1_primary_nl=neutral loss string
|
28
|
-
# q1_p1_drange=startPos:endPos
|
29
|
-
# q1_p1_terms=residue,residue:residue,residue # flanking AA for each protien, in order
|
30
|
-
#
|
6
|
+
# load this whole section into memory at once. It is often quite large and
|
7
|
+
# needs to be accessed using the provided Enumerable or random access methods.
|
31
8
|
class Peptides
|
32
9
|
include Enumerable
|
33
|
-
# A
|
34
|
-
#
|
35
|
-
|
10
|
+
# A nested Hash index of the byte offset positions for the peptide-spectrum-match entries.
|
11
|
+
# The keys of the index are the query and peptide rank (Fixnum), the structure of which is:
|
12
|
+
# { query_number => { peptide_rank => byte_position } }
|
13
|
+
# To access a particular entry, it is better to use the {#psm} method.
|
14
|
+
# @return [Hash{ Fixnum => Hash{ Fixnum => Fixnum }}] The nested hash of query peptide match byte offsets
|
15
|
+
attr_reader :psmidx
|
36
16
|
|
37
|
-
#
|
38
|
-
#
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
@
|
44
|
-
|
45
|
-
|
17
|
+
# @param dat [Mascot::DAT] Source DAT file
|
18
|
+
# @param section_label [Symbol] Section header, one of :peptides or :decoy_peptides
|
19
|
+
# @param cache_psm_index [Boolean] Whether to cache the PSM index
|
20
|
+
def initialize(dat, section_label, cache_psm_index=true)
|
21
|
+
# create our own filehandle, since other operations may interfere with the cursor position of a shared one
|
22
|
+
@dat = Mascot::DAT.open(dat.dat_file.path)
|
23
|
+
@filehandle = @dat.dat_file
|
24
|
+
@section_label = section_label
|
25
|
+
self.rewind
|
46
26
|
@curr_psm = [1,1]
|
47
|
-
@psmidx =
|
48
|
-
@
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
def index_psm_positions
|
53
|
-
# create an in-memroy index of PSM byteoffsets
|
54
|
-
q,p = 0
|
55
|
-
@boundary_line = @file.readline
|
56
|
-
@boundary = Regexp.new(@boundary_line)
|
57
|
-
@file.each do |line|
|
58
|
-
break if line =~ @boundary
|
59
|
-
if @cache_psm_index
|
60
|
-
line =~ /q(\d+)_p(\d+)/
|
61
|
-
i,j = $1.to_i, $2.to_i
|
62
|
-
next if q == i && p == j
|
63
|
-
unless @psmidx[i].kind_of? Array
|
64
|
-
q = i
|
65
|
-
@psmidx[q] = []
|
66
|
-
end
|
67
|
-
@psmidx[i][j] = @file.pos - line.length
|
68
|
-
q,p = i,j
|
69
|
-
end
|
27
|
+
@psmidx = {}
|
28
|
+
@endbytepos = Float::INFINITY
|
29
|
+
if cache_psm_index
|
30
|
+
index_psm_positions()
|
70
31
|
end
|
71
|
-
@endbytepos = @file.pos - @boundary_line.length
|
72
|
-
rewind
|
73
32
|
end
|
74
33
|
|
34
|
+
# Rewind the cursor to the start of the peptides section (e.g. q1_p1=...)
|
75
35
|
def rewind
|
76
|
-
@
|
36
|
+
@dat.goto(@section_label)
|
37
|
+
1.upto(2) { @filehandle.readline }
|
77
38
|
end
|
78
39
|
|
79
40
|
# Return a specific {Mascot::DAT::PSM} identified for query <code>q</code> and peptide number <code>p</code>
|
80
|
-
# @param
|
81
|
-
# @param
|
82
|
-
# @return Mascot::DAT::PSM
|
83
|
-
|
84
|
-
|
85
|
-
|
41
|
+
# @param query_number [Fixnum]
|
42
|
+
# @param rank [Fixnum]
|
43
|
+
# @return [Mascot::DAT::PSM]
|
44
|
+
# @raise [Exception] if given an invalid q,p coordinate
|
45
|
+
# @example my_dat.peptides.psm(1,1) # => Mascot::DAT::PSM for query 1 peptide 1
|
46
|
+
def psm query_number,rank
|
47
|
+
if @psmidx[query_number] and @psmidx[query_number][rank]
|
48
|
+
@filehandle.pos = @psmidx[query_number][rank]
|
49
|
+
next_psm
|
50
|
+
else
|
51
|
+
raise Exception.new "Invalid PSM specification (#{q},#{p})"
|
52
|
+
end
|
86
53
|
end
|
87
54
|
|
88
55
|
# Returns the next {Mascot::DAT::PSM} from the DAT file. If there is no other PSM, then it returns nil.
|
89
|
-
# @return Mascot::DAT::PSM
|
56
|
+
# @return [Mascot::DAT::PSM, NilClass]
|
90
57
|
def next_psm
|
91
|
-
|
58
|
+
if @filehandle.pos >= @endbytepos
|
59
|
+
return nil
|
60
|
+
end
|
92
61
|
# get the initial values for query & rank
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
break if
|
62
|
+
buffer = [@filehandle.readline.chomp]
|
63
|
+
buffer[0] =~ /q(\d+)_p(\d+)/
|
64
|
+
q,p = $1, $2
|
65
|
+
@curr_psm = [q,p]
|
66
|
+
prev_pos = @filehandle.pos
|
67
|
+
@filehandle.each do |l|
|
68
|
+
l.chomp!
|
69
|
+
# break if we have reached the boundary
|
70
|
+
if l =~ @boundary
|
71
|
+
@endbytepos = @filehandle.pos - @dat.boundary_string.length
|
72
|
+
break
|
73
|
+
end
|
74
|
+
# break if we are on another PSM
|
106
75
|
break unless l =~ /^q#{q}_p#{p}/
|
107
|
-
|
108
|
-
|
76
|
+
buffer << l
|
77
|
+
prev_pos = @filehandle.pos
|
109
78
|
end
|
110
|
-
|
111
|
-
|
112
|
-
|
79
|
+
# rewind the cursor to the last hit
|
80
|
+
@filehandle.pos = prev_pos
|
81
|
+
# return the new PSM
|
82
|
+
Mascot::DAT::PSM.new(buffer)
|
113
83
|
end
|
114
84
|
|
115
85
|
# Iterate through all of the {Mascot::DAT::PSM} entries in the DAT file.
|
116
|
-
# @
|
86
|
+
# @yield [Mascot::DAT::PSM]
|
117
87
|
def each
|
118
|
-
|
119
|
-
while
|
120
|
-
psm = next_psm()
|
121
|
-
next if psm.nil?
|
88
|
+
self.rewind
|
89
|
+
while psm = self.next_psm
|
122
90
|
yield psm
|
123
91
|
end
|
124
92
|
end
|
93
|
+
|
94
|
+
private
|
95
|
+
# Index the byte offsets of the PSMs
|
96
|
+
# @private
|
97
|
+
def index_psm_positions
|
98
|
+
# create an in-memory index of PSM byte offsets
|
99
|
+
q,p = 0,0
|
100
|
+
# move the cursor past the boundary line
|
101
|
+
@filehandle.readline
|
102
|
+
@filehandle.each do |line|
|
103
|
+
break if line =~ @dat.boundary
|
104
|
+
line =~ /q(\d+)_p(\d+)/
|
105
|
+
qq,pp= $1.to_i, $2.to_i
|
106
|
+
next if q == qq && p == pp
|
107
|
+
q,p = qq,pp
|
108
|
+
unless @psmidx.has_key?(q)
|
109
|
+
@psmidx[q] = {}
|
110
|
+
end
|
111
|
+
@psmidx[q][p] = @filehandle.pos - line.length
|
112
|
+
end
|
113
|
+
@endbytepos = @filehandle.pos - @dat.boundary_string.length
|
114
|
+
self.rewind
|
115
|
+
end
|
125
116
|
end
|
126
117
|
end
|
127
118
|
end
|
data/lib/mascot/dat/psm.rb
CHANGED
@@ -1,81 +1,97 @@
|
|
1
1
|
module Mascot
|
2
2
|
class DAT
|
3
|
+
# A single Peptide Spectrum Match (PSM) result. In Mascot parlance, this is a
|
4
|
+
# match from a query (e.g. a single MS2 spectrum from a MGF file) to a given peptide. A query may match more than one
|
5
|
+
# peptide at a given score, and Mascot will report these in order of descending significance, or "rank".
|
6
|
+
#
|
7
|
+
# From the Mascot documentation, the following represents a reasonably complete PSM entry
|
8
|
+
# q1_p1_db=01 # two-digit, zero-padded integer index of the search DB
|
9
|
+
# q1_p1=missed cleavages, (-1 indicates no match)
|
10
|
+
# peptide Mr,
|
11
|
+
# delta,
|
12
|
+
# number of ions matched,
|
13
|
+
# peptide string,
|
14
|
+
# peaks used from Ions1,
|
15
|
+
# variable modifications string,
|
16
|
+
# ions score,
|
17
|
+
# ion series found,
|
18
|
+
# peaks used from Ions2,
|
19
|
+
# peaks used from Ions3;
|
20
|
+
# "accession string":frame number:start:end:multiplicity, # data for first protein
|
21
|
+
# "accession string":frame number:start:end:multiplicity, # data for second protein, etc.
|
22
|
+
# q1_p1_et_mods=modification mass,
|
23
|
+
# neutral loss mass,
|
24
|
+
# modification description
|
25
|
+
# q1_p1_primary_nl=neutral loss string
|
26
|
+
# q1_p1_drange=startPos:endPos
|
27
|
+
# q1_p1_terms=residue,residue:residue,residue # flanking AA for each protein, in order
|
28
|
+
#
|
3
29
|
class PSM
|
4
30
|
|
5
|
-
|
6
|
-
|
7
|
-
|
31
|
+
attr_accessor :query
|
32
|
+
attr_accessor :rank
|
33
|
+
attr_accessor :missed_cleavages
|
34
|
+
attr_accessor :mr
|
35
|
+
attr_accessor :delta
|
36
|
+
attr_accessor :num_ions_matched
|
37
|
+
attr_accessor :pep
|
38
|
+
attr_accessor :ions1
|
39
|
+
attr_accessor :var_mods_str
|
40
|
+
attr_accessor :score
|
41
|
+
attr_accessor :ion_series_str
|
42
|
+
attr_accessor :ions2
|
43
|
+
attr_accessor :ions3
|
44
|
+
attr_accessor :proteins
|
45
|
+
attr_accessor :dbs
|
46
|
+
attr_accessor :terms
|
47
|
+
attr_accessor :attrs
|
8
48
|
|
9
|
-
|
10
|
-
|
49
|
+
# @param psm_entry [Array] The multi-line string entry from the Mascot DAT file
|
50
|
+
# @return [Mascot::DAT::PSM]
|
51
|
+
def initialize(psm_entry)
|
52
|
+
parse_entry(psm_entry)
|
11
53
|
end
|
12
54
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def ==(other)
|
26
|
-
is_eql = true
|
27
|
-
ATTRS.each do |a|
|
28
|
-
if self.send(a) != other.send(a)
|
29
|
-
is_eql = false
|
30
|
-
break
|
31
|
-
end
|
32
|
-
end
|
33
|
-
is_eql
|
34
|
-
end
|
35
|
-
def self.parse psm_arr
|
36
|
-
psm_result = self.new()
|
37
|
-
psm_arr.each do |l|
|
38
|
-
next unless l =~ /^q/
|
39
|
-
|
55
|
+
private
|
56
|
+
# Parses the query entry multi-line string from the Mascot DAT file
|
57
|
+
# @private
|
58
|
+
# @param psm_entry [Array]
|
59
|
+
# @return [Mascot::DAT::PSM]
|
60
|
+
def parse_entry psm_entry
|
61
|
+
psm_entry.each do |l|
|
40
62
|
k,v = l.split "="
|
41
63
|
case k
|
42
64
|
when /^q(\d+)_p(\d+)$/
|
43
|
-
|
44
|
-
|
65
|
+
@query = $1.to_i
|
66
|
+
@rank = $2.to_i
|
45
67
|
psm_vals, prots = v.split(";")
|
46
68
|
psm_vals = psm_vals.split(',')
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
69
|
+
@missed_cleavages= psm_vals[0].to_i
|
70
|
+
@mr = psm_vals[1].to_f
|
71
|
+
@delta = psm_vals[2].to_f
|
72
|
+
@num_ions_matched = psm_vals[3].to_i
|
73
|
+
@pep = psm_vals[4]
|
74
|
+
@ions1 = psm_vals[5].to_i
|
75
|
+
@var_mods_str = psm_vals[6]
|
76
|
+
@score = psm_vals[7].to_f
|
77
|
+
@ion_series_str = psm_vals[8]
|
78
|
+
@ions2 = psm_vals[9].to_i
|
79
|
+
@ions3 = psm_vals[10].to_i
|
58
80
|
|
59
|
-
# assign
|
60
|
-
|
81
|
+
# assign proteins
|
82
|
+
@proteins = prots.split(",").map do |pe|
|
61
83
|
acc,*other_vals = pe.split(":")
|
62
84
|
acc.gsub!(/\"/,'')
|
63
85
|
[acc] + other_vals.map {|e| e.to_i }
|
64
86
|
end
|
65
87
|
when /db$/
|
66
88
|
# split on 2 chars, call to_i
|
67
|
-
|
89
|
+
@dbs = v.split(/(\d{2})/).grep(/^\d+$/).collect { |e| e.to_i }
|
68
90
|
when /terms$/
|
69
91
|
# for each protein, I have to add the term AA
|
70
|
-
|
71
|
-
else
|
72
|
-
# returns the smaller key
|
73
|
-
puts "****#{k}***"
|
74
|
-
k_sym = k.slice(/q\d+_p\d+_?(.+)/,1).to_sym
|
75
|
-
psm_result.attrs[k_sym] = v
|
92
|
+
@terms = v.split(":").collect {|t| t.split(",") }
|
76
93
|
end
|
77
94
|
end
|
78
|
-
psm_result
|
79
95
|
end
|
80
96
|
end
|
81
97
|
end
|
data/lib/mascot/dat/query.rb
CHANGED
data/lib/mascot/dat/version.rb
CHANGED
@@ -52,4 +52,25 @@ class TestMascotDatPeptides < TestMascotDatHelper
|
|
52
52
|
assert_equal([2], q1p1_psm.dbs)
|
53
53
|
assert_equal([["R","Y"]], q1p1_psm.terms)
|
54
54
|
end
|
55
|
+
|
56
|
+
def test_second_psm_from_next_psm
|
57
|
+
q1p1_psm = @peptides.psm(1,1)
|
58
|
+
q1p2_psm = @peptides.next_psm()
|
59
|
+
assert_equal(1, q1p2_psm.query)
|
60
|
+
assert_equal(2, q1p2_psm.rank)
|
61
|
+
assert_equal(0, q1p2_psm.missed_cleavages)
|
62
|
+
assert_equal(476.223068, q1p2_psm.mr)
|
63
|
+
assert_equal(-0.940226, q1p2_psm.delta)
|
64
|
+
assert_equal(4, q1p2_psm.num_ions_matched)
|
65
|
+
assert_equal("GGESK", q1p2_psm.pep)
|
66
|
+
assert_equal(9, q1p2_psm.ions1)
|
67
|
+
assert_equal("0000000", q1p2_psm.var_mods_str)
|
68
|
+
assert_equal(13.29, q1p2_psm.score)
|
69
|
+
assert_equal("0000002020000000000", q1p2_psm.ion_series_str)
|
70
|
+
assert_equal(0, q1p2_psm.ions2)
|
71
|
+
assert_equal(0, q1p2_psm.ions3)
|
72
|
+
assert_equal([["P70298", 0, 605, 609, 1]], q1p2_psm.proteins)
|
73
|
+
assert_equal([2], q1p2_psm.dbs)
|
74
|
+
assert_equal([["K","N"]], q1p2_psm.terms)
|
75
|
+
end
|
55
76
|
end
|
@@ -12,7 +12,7 @@ class TestMascotDatQuery < TestMascotDatHelper
|
|
12
12
|
assert_equal("281.832701459371_513",@query.title)
|
13
13
|
end
|
14
14
|
def test_rtinseconds
|
15
|
-
assert_equal(513
|
15
|
+
assert_equal(513, @query.rtinseconds)
|
16
16
|
end
|
17
17
|
def test_index
|
18
18
|
assert_equal(30,@query.index)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mascot-dat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-07-10 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &70273728840180 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70273728840180
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: yard
|
27
|
-
requirement: &
|
27
|
+
requirement: &70273728839140 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,7 +32,7 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70273728839140
|
36
36
|
description: Mascot DAT file format parser
|
37
37
|
email:
|
38
38
|
- angel@upenn.edu
|
@@ -88,7 +88,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
88
88
|
version: '0'
|
89
89
|
segments:
|
90
90
|
- 0
|
91
|
-
hash:
|
91
|
+
hash: 3863437592712259051
|
92
92
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
93
|
none: false
|
94
94
|
requirements:
|
@@ -97,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
97
|
version: '0'
|
98
98
|
segments:
|
99
99
|
- 0
|
100
|
-
hash:
|
100
|
+
hash: 3863437592712259051
|
101
101
|
requirements: []
|
102
102
|
rubyforge_project:
|
103
103
|
rubygems_version: 1.8.11
|