mascot-dat 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/mascot/dat.rb CHANGED
@@ -87,7 +87,7 @@ module Mascot
87
87
 
88
88
  # Parse the enzyme information from the DAT file
89
89
  #
90
- # @return [[Mascot::DAT::Enzyme]]
90
+ # @return [Mascot::DAT::Enzyme]
91
91
  def enzyme
92
92
  @enzyme ||= Mascot::DAT::Enzyme.new(self.read_section(:enzyme))
93
93
  end
@@ -110,13 +110,27 @@ module Mascot
110
110
  end
111
111
 
112
112
  # Puts the IO cursor at the beginning of peptide result section. Returns an iterator/parser for PSM results
113
- # @return [Mascot::DAT::Peptides]
113
+ #
114
+ # @param cache_psm_index Whether to cache the positions of the PSMs. If you just want to iterate
115
+ # through PSMs, you do not need to cache the index.
116
+ # @return [Mascot::DAT::Peptides]
114
117
  def peptides(cache_psm_index=true)
115
- Mascot::DAT::Peptides.new(self.dat_file, self.idx[:peptides], cache_psm_index)
118
+ Mascot::DAT::Peptides.new(self, :peptides, cache_psm_index)
116
119
  end
117
120
 
121
+ # If the DAT file has a decoy section, puts the IO cursor at the beginning of decoy_peptide
122
+ # result section and returns an iterator/parser for the decoy PSM results.
123
+ # If no decoy section exists, it will return nil.
124
+ #
125
+ # @param cache_psm_index Whether to cache the positions of the PSMs. If you just want to iterate
126
+ # through PSMs, you do not need to cache the index.
127
+ # @return [Mascot::DAT::Peptides, NilClass]
118
128
  def decoy_peptides(cache_psm_index=true)
119
- Mascot::DAT::Peptides.new(self.dat_file, self.idx[:decoy_peptides], cache_psm_index)
129
+ if @idx.has_key? :decoy_peptides
130
+ Mascot::DAT::Peptides.new(self,:decoy_peptides, cache_psm_index)
131
+ else
132
+ nil
133
+ end
120
134
  end
121
135
 
122
136
 
@@ -1,127 +1,118 @@
1
1
  require 'csv'
2
2
  module Mascot
3
3
  class DAT
4
- # A parser for the peptide spectrum match results of a Mascot DAT file.
4
+ # An iterator for the peptide spectrum match results of a Mascot DAT file.
5
5
  # As opposed to the other sections of a DAT file, you don't really want to
6
- # access this section as one big chunk in memory. It is often quite large and
7
- # needs to be accessed using Enumerable methods.
8
- #
9
- # From the Mascot documentation, the following represents a reasonably complete PSM
10
- # q1_p1_db=01 # two digit integer of the search DB index, zero filled and retarded.
11
- # q1_p1=missed cleavages, (–1 indicates no match)
12
- # peptide Mr,
13
- # delta,
14
- # number of ions matched,
15
- # peptide string,
16
- # peaks used from Ions1,
17
- # variable modifications string,
18
- # ions score,
19
- # ion series found,
20
- # peaks used from Ions2,
21
- # peaks used from Ions3;
22
- # “accession string”:frame number:start:end:multiplicity, # data for first protein
23
- # “accession string”:frame number:start:end:multiplicity, # data for second protein, etc.
24
- # q1_p1_et_mods=modification mass,
25
- # neutral loss mass,
26
- # modification description
27
- # q1_p1_primary_nl=neutral loss string
28
- # q1_p1_drange=startPos:endPos
29
- # q1_p1_terms=residue,residue:residue,residue # flanking AA for each protien, in order
30
- #
6
+ # access this section in memory at once. It is often quite large and
7
+ # needs to be accessed using the provided Enumerable or random access methods.
31
8
  class Peptides
32
9
  include Enumerable
33
- # A hash of the index positions for the peptide PSM matches.
34
- # Keys arr
35
- attr_reader :psmidx, :byteoffset, :endbytepos
10
+ # A nested Hash index of the byte offset positions for the peptide-spectrum-match entries.
11
+ # The keys of the index are the query and peptide rank (Fixnum), the structure of which is:
12
+ # { query_number => { peptide_rank => byte_position } }
13
+ # To access a particular entry, it is better to use the {#psm} method.
14
+ # @return [Hash{ Fixnum => Hash{ Fixnum => Fixnum }}] The nested hash of query peptide match byte offsets
15
+ attr_reader :psmidx
36
16
 
37
- # To create a peptides enumerable, you need to pass in the dat file handle and
38
- # the byte offset of the peptides section.
39
- def initialize(dat_file, byteoffset, cache_psm_index=true)
40
- @byteoffset = byteoffset
41
- @endbytepos = nil
42
-
43
- @file = dat_file
44
-
45
- @file.pos = @byteoffset
17
+ # @param dat [Mascot::DAT] Source DAT file
18
+ # @param section_label [Symbol] Section header, one of :peptides or :decoy_peptides
19
+ # @param cache_psm_index [Boolean] Whether to cache the PSM index
20
+ def initialize(dat, section_label, cache_psm_index=true)
21
+ # create our own filehandle, since other operations may interfere with the cursor position of a shared one
22
+ @dat = Mascot::DAT.open(dat.dat_file.path)
23
+ @filehandle = @dat.dat_file
24
+ @section_label = section_label
25
+ self.rewind
46
26
  @curr_psm = [1,1]
47
- @psmidx = []
48
- @cache_psm_index = cache_psm_index
49
- index_psm_positions()
50
- end
51
-
52
- def index_psm_positions
53
- # create an in-memroy index of PSM byteoffsets
54
- q,p = 0
55
- @boundary_line = @file.readline
56
- @boundary = Regexp.new(@boundary_line)
57
- @file.each do |line|
58
- break if line =~ @boundary
59
- if @cache_psm_index
60
- line =~ /q(\d+)_p(\d+)/
61
- i,j = $1.to_i, $2.to_i
62
- next if q == i && p == j
63
- unless @psmidx[i].kind_of? Array
64
- q = i
65
- @psmidx[q] = []
66
- end
67
- @psmidx[i][j] = @file.pos - line.length
68
- q,p = i,j
69
- end
27
+ @psmidx = {}
28
+ @endbytepos = Float::INFINITY
29
+ if cache_psm_index
30
+ index_psm_positions()
70
31
  end
71
- @endbytepos = @file.pos - @boundary_line.length
72
- rewind
73
32
  end
74
33
 
34
+ # Rewind the cursor to the start of the peptides section (e.g. q1_p1=...)
75
35
  def rewind
76
- @file.pos = @byteoffset + @boundary_line.length
36
+ @dat.goto(@section_label)
37
+ 1.upto(2) { @filehandle.readline }
77
38
  end
78
39
 
79
40
  # Return a specific {Mascot::DAT::PSM} identified for query <code>q</code> and peptide number <code>p</code>
80
- # @param q Fixnum
81
- # @param p Fixnum
82
- # @return Mascot::DAT::PSM
83
- def psm q,p
84
- @file.pos = @psmidx[q][p]
85
- next_psm
41
+ # @param query_number [Fixnum]
42
+ # @param rank [Fixnum]
43
+ # @return [Mascot::DAT::PSM]
44
+ # @raise [Exception] if given an invalid q,p coordinate
45
+ # @example my_dat.peptides.psm(1,1) # => Mascot::DAT::PSM for query 1 peptide 1
46
+ def psm query_number,rank
47
+ if @psmidx[query_number] and @psmidx[query_number][rank]
48
+ @filehandle.pos = @psmidx[query_number][rank]
49
+ next_psm
50
+ else
51
+ raise Exception.new "Invalid PSM specification (#{q},#{p})"
52
+ end
86
53
  end
87
54
 
88
55
  # Returns the next {Mascot::DAT::PSM} from the DAT file. If there is no other PSM, then it returns nil.
89
- # @return Mascot::DAT::PSM
56
+ # @return [Mascot::DAT::PSM, NilClass]
90
57
  def next_psm
91
- return nil if @file.pos >= @endbytepos
58
+ if @filehandle.pos >= @endbytepos
59
+ return nil
60
+ end
92
61
  # get the initial values for query & rank
93
- tmp = []
94
- tmp << @file.readline.chomp
95
- k,v = tmp[0].split "="
96
- # skip when there are no peptides (value equals -1)
97
- return nil if v == "-1"
98
-
99
- tmp[0] =~ /q(\d+)_p(\d+)/
100
- q = $1
101
- p = $2
102
-
103
- tmp_pos = @file.pos
104
- @file.each do |l|
105
- break if l =~ @boundary
62
+ buffer = [@filehandle.readline.chomp]
63
+ buffer[0] =~ /q(\d+)_p(\d+)/
64
+ q,p = $1, $2
65
+ @curr_psm = [q,p]
66
+ prev_pos = @filehandle.pos
67
+ @filehandle.each do |l|
68
+ l.chomp!
69
+ # break if we have reached the boundary
70
+ if l =~ @boundary
71
+ @endbytepos = @filehandle.pos - @dat.boundary_string.length
72
+ break
73
+ end
74
+ # break if we are on another PSM
106
75
  break unless l =~ /^q#{q}_p#{p}/
107
- tmp << l.chomp
108
- tmp_pos = @file.pos
76
+ buffer << l
77
+ prev_pos = @filehandle.pos
109
78
  end
110
- @file.pos = tmp_pos
111
-
112
- Mascot::DAT::PSM.parse(tmp)
79
+ # rewind the cursor to the last hit
80
+ @filehandle.pos = prev_pos
81
+ # return the new PSM
82
+ Mascot::DAT::PSM.new(buffer)
113
83
  end
114
84
 
115
85
  # Iterate through all of the {Mascot::DAT::PSM} entries in the DAT file.
116
- # @return Enumerator
86
+ # @yield [Mascot::DAT::PSM]
117
87
  def each
118
- @file.pos = @byteoffset
119
- while @file.pos < @endbytepos
120
- psm = next_psm()
121
- next if psm.nil?
88
+ self.rewind
89
+ while psm = self.next_psm
122
90
  yield psm
123
91
  end
124
92
  end
93
+
94
+ private
95
+ # Index the byte offsets of the PSMs
96
+ # @private
97
+ def index_psm_positions
98
+ # create an in-memory index of PSM byte offsets
99
+ q,p = 0,0
100
+ # move the cursor past the boundary line
101
+ @filehandle.readline
102
+ @filehandle.each do |line|
103
+ break if line =~ @dat.boundary
104
+ line =~ /q(\d+)_p(\d+)/
105
+ qq,pp= $1.to_i, $2.to_i
106
+ next if q == qq && p == pp
107
+ q,p = qq,pp
108
+ unless @psmidx.has_key?(q)
109
+ @psmidx[q] = {}
110
+ end
111
+ @psmidx[q][p] = @filehandle.pos - line.length
112
+ end
113
+ @endbytepos = @filehandle.pos - @dat.boundary_string.length
114
+ self.rewind
115
+ end
125
116
  end
126
117
  end
127
118
  end
@@ -1,81 +1,97 @@
1
1
  module Mascot
2
2
  class DAT
3
+ # A single Peptide Spectrum Match (PSM) result. In Mascot parlance, this is a
4
+ # match from a query (e.g. a single MS2 spectrum from a MGF file) to a given peptide. A query may match more than one
5
+ # peptide at a given score, and Mascot will report these in order of descending significance, or "rank".
6
+ #
7
+ # From the Mascot documentation, the following represents a reasonably complete PSM entry
8
+ # q1_p1_db=01 # two-digit integer of the search DB index, zero-padded.
9
+ # q1_p1=missed cleavages, (-1 indicates no match)
10
+ # peptide Mr,
11
+ # delta,
12
+ # number of ions matched,
13
+ # peptide string,
14
+ # peaks used from Ions1,
15
+ # variable modifications string,
16
+ # ions score,
17
+ # ion series found,
18
+ # peaks used from Ions2,
19
+ # peaks used from Ions3;
20
+ # "accession string":frame number:start:end:multiplicity, # data for first protein
21
+ # "accession string":frame number:start:end:multiplicity, # data for second protein, etc.
22
+ # q1_p1_et_mods=modification mass,
23
+ # neutral loss mass,
24
+ # modification description
25
+ # q1_p1_primary_nl=neutral loss string
26
+ # q1_p1_drange=startPos:endPos
27
+ # q1_p1_terms=residue,residue:residue,residue # flanking AA for each protein, in order
28
+ #
3
29
  class PSM
4
30
 
5
- ATTRS = [:query,:rank,:missed_cleavages,:mr, :delta,
6
- :num_ions_matched,:pep,:ions1,:var_mods_str,:score,
7
- :ion_series_str,:ions2,:ions3,:proteins,:dbs,:terms,:attrs]
31
+ attr_accessor :query
32
+ attr_accessor :rank
33
+ attr_accessor :missed_cleavages
34
+ attr_accessor :mr
35
+ attr_accessor :delta
36
+ attr_accessor :num_ions_matched
37
+ attr_accessor :pep
38
+ attr_accessor :ions1
39
+ attr_accessor :var_mods_str
40
+ attr_accessor :score
41
+ attr_accessor :ion_series_str
42
+ attr_accessor :ions2
43
+ attr_accessor :ions3
44
+ attr_accessor :proteins
45
+ attr_accessor :dbs
46
+ attr_accessor :terms
47
+ attr_accessor :attrs
8
48
 
9
- ATTRS.each do |a|
10
- attr_accessor a
49
+ # @param psm_entry [Array] The multi-line string entry from the Mascot DAT file
50
+ # @return [Mascot::DAT::PSM]
51
+ def initialize(psm_entry)
52
+ parse_entry(psm_entry)
11
53
  end
12
54
 
13
- def initialize(*opts)
14
- @attrs = {}
15
-
16
- if opts.kind_of? Hash
17
- opts.keys.each do |k|
18
- if ATTRS.index(k.to_sym)
19
- eval "@#{k} = #{opts[k]}"
20
- end
21
- end
22
- end
23
- end
24
-
25
- def ==(other)
26
- is_eql = true
27
- ATTRS.each do |a|
28
- if self.send(a) != other.send(a)
29
- is_eql = false
30
- break
31
- end
32
- end
33
- is_eql
34
- end
35
- def self.parse psm_arr
36
- psm_result = self.new()
37
- psm_arr.each do |l|
38
- next unless l =~ /^q/
39
-
55
+ private
56
+ # Parses the query entry multi-line string from the Mascot DAT file
57
+ # @private
58
+ # @param psm_entry [Array]
59
+ # @return [Mascot::DAT::PSM]
60
+ def parse_entry psm_entry
61
+ psm_entry.each do |l|
40
62
  k,v = l.split "="
41
63
  case k
42
64
  when /^q(\d+)_p(\d+)$/
43
- psm_result.query = $1.to_i
44
- psm_result.rank = $2.to_i
65
+ @query = $1.to_i
66
+ @rank = $2.to_i
45
67
  psm_vals, prots = v.split(";")
46
68
  psm_vals = psm_vals.split(',')
47
- psm_result.missed_cleavages= psm_vals[0].to_i
48
- psm_result.mr = psm_vals[1].to_f
49
- psm_result.delta = psm_vals[2].to_f
50
- psm_result.num_ions_matched = psm_vals[3].to_i
51
- psm_result.pep = psm_vals[4]
52
- psm_result.ions1 = psm_vals[5].to_i
53
- psm_result.var_mods_str = psm_vals[6]
54
- psm_result.score = psm_vals[7].to_f
55
- psm_result.ion_series_str = psm_vals[8]
56
- psm_result.ions2 = psm_vals[9].to_i
57
- psm_result.ions3 = psm_vals[10].to_i
69
+ @missed_cleavages= psm_vals[0].to_i
70
+ @mr = psm_vals[1].to_f
71
+ @delta = psm_vals[2].to_f
72
+ @num_ions_matched = psm_vals[3].to_i
73
+ @pep = psm_vals[4]
74
+ @ions1 = psm_vals[5].to_i
75
+ @var_mods_str = psm_vals[6]
76
+ @score = psm_vals[7].to_f
77
+ @ion_series_str = psm_vals[8]
78
+ @ions2 = psm_vals[9].to_i
79
+ @ions3 = psm_vals[10].to_i
58
80
 
59
- # assign proteins
60
- psm_result.proteins = prots.split(",").map do |pe|
81
+ # assign proteins
82
+ @proteins = prots.split(",").map do |pe|
61
83
  acc,*other_vals = pe.split(":")
62
84
  acc.gsub!(/\"/,'')
63
85
  [acc] + other_vals.map {|e| e.to_i }
64
86
  end
65
87
  when /db$/
66
88
  # split on 2 chars, call to_i
67
- psm_result.dbs = v.split(/(\d{2})/).grep(/^\d+$/) { |e| e.to_i }
89
+ @dbs = v.split(/(\d{2})/).grep(/^\d+$/).collect { |e| e.to_i }
68
90
  when /terms$/
69
91
  # for each protein, I have to add the term AA
70
- psm_result.terms = v.split(":").collect {|t| t.split(",") }
71
- else
72
- # returns the smaller key
73
- puts "****#{k}***"
74
- k_sym = k.slice(/q\d+_p\d+_?(.+)/,1).to_sym
75
- psm_result.attrs[k_sym] = v
92
+ @terms = v.split(":").collect {|t| t.split(",") }
76
93
  end
77
94
  end
78
- psm_result
79
95
  end
80
96
  end
81
97
  end
@@ -77,7 +77,7 @@ module Mascot
77
77
  when "index"
78
78
  @index = v.to_i
79
79
  when "rtinseconds"
80
- @rtinseconds = v.to_f
80
+ @rtinseconds = v.to_i
81
81
  when "charge"
82
82
  @charge = v
83
83
  when "mass_min"
@@ -1,6 +1,6 @@
1
1
  module Mascot
2
2
  class DAT
3
- VERSION = "0.2.1"
3
+ VERSION = "0.3.0"
4
4
  end
5
5
  end
6
6
 
@@ -52,4 +52,25 @@ class TestMascotDatPeptides < TestMascotDatHelper
52
52
  assert_equal([2], q1p1_psm.dbs)
53
53
  assert_equal([["R","Y"]], q1p1_psm.terms)
54
54
  end
55
+
56
+ def test_second_psm_from_next_psm
57
+ q1p1_psm = @peptides.psm(1,1)
58
+ q1p2_psm = @peptides.next_psm()
59
+ assert_equal(1, q1p2_psm.query)
60
+ assert_equal(2, q1p2_psm.rank)
61
+ assert_equal(0, q1p2_psm.missed_cleavages)
62
+ assert_equal(476.223068, q1p2_psm.mr)
63
+ assert_equal(-0.940226, q1p2_psm.delta)
64
+ assert_equal(4, q1p2_psm.num_ions_matched)
65
+ assert_equal("GGESK", q1p2_psm.pep)
66
+ assert_equal(9, q1p2_psm.ions1)
67
+ assert_equal("0000000", q1p2_psm.var_mods_str)
68
+ assert_equal(13.29, q1p2_psm.score)
69
+ assert_equal("0000002020000000000", q1p2_psm.ion_series_str)
70
+ assert_equal(0, q1p2_psm.ions2)
71
+ assert_equal(0, q1p2_psm.ions3)
72
+ assert_equal([["P70298", 0, 605, 609, 1]], q1p2_psm.proteins)
73
+ assert_equal([2], q1p2_psm.dbs)
74
+ assert_equal([["K","N"]], q1p2_psm.terms)
75
+ end
55
76
  end
@@ -12,7 +12,7 @@ class TestMascotDatQuery < TestMascotDatHelper
12
12
  assert_equal("281.832701459371_513",@query.title)
13
13
  end
14
14
  def test_rtinseconds
15
- assert_equal(513.0, @query.rtinseconds)
15
+ assert_equal(513, @query.rtinseconds)
16
16
  end
17
17
  def test_index
18
18
  assert_equal(30,@query.index)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mascot-dat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-07-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &70341777412840 !ruby/object:Gem::Requirement
16
+ requirement: &70273728840180 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70341777412840
24
+ version_requirements: *70273728840180
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: yard
27
- requirement: &70341777410100 !ruby/object:Gem::Requirement
27
+ requirement: &70273728839140 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,7 +32,7 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70341777410100
35
+ version_requirements: *70273728839140
36
36
  description: Mascot DAT file format parser
37
37
  email:
38
38
  - angel@upenn.edu
@@ -88,7 +88,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
88
88
  version: '0'
89
89
  segments:
90
90
  - 0
91
- hash: 4478337334591189647
91
+ hash: 3863437592712259051
92
92
  required_rubygems_version: !ruby/object:Gem::Requirement
93
93
  none: false
94
94
  requirements:
@@ -97,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
97
  version: '0'
98
98
  segments:
99
99
  - 0
100
- hash: 4478337334591189647
100
+ hash: 3863437592712259051
101
101
  requirements: []
102
102
  rubyforge_project:
103
103
  rubygems_version: 1.8.11