mascot-dat 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/mascot/dat.rb CHANGED
@@ -87,7 +87,7 @@ module Mascot
87
87
 
88
88
  # Parse the enzyme information from the DAT file
89
89
  #
90
- # @return [[Mascot::DAT::Enzyme]]
90
+ # @return [Mascot::DAT::Enzyme]
91
91
  def enzyme
92
92
  @enzyme ||= Mascot::DAT::Enzyme.new(self.read_section(:enzyme))
93
93
  end
@@ -110,13 +110,27 @@ module Mascot
110
110
  end
111
111
 
112
112
  # Puts the IO cursor at the beginning of peptide result section. Returns an iterator/parser for PSM results
113
- # @return [Mascot::DAT::Peptides]
113
+ #
114
+ # @param cache_psm_index Whether to cache the positions of the PSMs. If you just want to iterate
115
+ # through PSMs, you do not need to cache the index.
116
+ # @return [Mascot::DAT::Peptides, NilClass]
114
117
  def peptides(cache_psm_index=true)
115
- Mascot::DAT::Peptides.new(self.dat_file, self.idx[:peptides], cache_psm_index)
118
+ Mascot::DAT::Peptides.new(self, :peptides, cache_psm_index)
116
119
  end
117
120
 
121
+ # If the DAT file has a decoy section, puts the IO cursor at the beginning of decoy_peptide
122
+ # result section and returns an iterator/parser for the decoy PSM results.
123
+ # If no decoy section exists, it will return nil.
124
+ #
125
+ # @param cache_psm_index Whether to cache the positions of the PSMs. If you just want to iterate
126
+ # through PSMs, you do not need to cache the index.
127
+ # @return [Mascot::DAT::Peptides, NilClass]
118
128
  def decoy_peptides(cache_psm_index=true)
119
- Mascot::DAT::Peptides.new(self.dat_file, self.idx[:decoy_peptides], cache_psm_index)
129
+ if @idx.has_key? :decoy_peptides
130
+ Mascot::DAT::Peptides.new(self,:decoy_peptides, cache_psm_index)
131
+ else
132
+ nil
133
+ end
120
134
  end
121
135
 
122
136
 
@@ -1,127 +1,118 @@
1
1
  require 'csv'
2
2
  module Mascot
3
3
  class DAT
4
- # A parser for the peptide spectrum match results of a Mascot DAT file.
4
+ # An iterator for the peptide spectrum match results of a Mascot DAT file.
5
5
  # As opposed to the other sections of a DAT file, you don't really want to
6
- # access this section as one big chunk in memory. It is often quite large and
7
- # needs to be accessed using Enumerable methods.
8
- #
9
- # From the Mascot documentation, the following represents a reasonably complete PSM
10
- # q1_p1_db=01 # two digit integer of the search DB index, zero filled and retarded.
11
- # q1_p1=missed cleavages, (–1 indicates no match)
12
- # peptide Mr,
13
- # delta,
14
- # number of ions matched,
15
- # peptide string,
16
- # peaks used from Ions1,
17
- # variable modifications string,
18
- # ions score,
19
- # ion series found,
20
- # peaks used from Ions2,
21
- # peaks used from Ions3;
22
- # “accession string”:frame number:start:end:multiplicity, # data for first protein
23
- # “accession string”:frame number:start:end:multiplicity, # data for second protein, etc.
24
- # q1_p1_et_mods=modification mass,
25
- # neutral loss mass,
26
- # modification description
27
- # q1_p1_primary_nl=neutral loss string
28
- # q1_p1_drange=startPos:endPos
29
- # q1_p1_terms=residue,residue:residue,residue # flanking AA for each protien, in order
30
- #
6
+ # access this section in memory at once. It is often quite large and
7
+ # needs to be accessed using the provided Enumerable or random access methods.
31
8
  class Peptides
32
9
  include Enumerable
33
- # A hash of the index positions for the peptide PSM matches.
34
- # Keys arr
35
- attr_reader :psmidx, :byteoffset, :endbytepos
10
+ # A nested Hash index of the byte offset positions for the peptide-spectrum-match entries.
11
+ # The keys of the index are the query and peptide rank (Fixnum), the structure of which is:
12
+ # { query_number => { peptide_rank => byte_position } }
13
+ # To access a particular entry, it is better to use the {#psm} method.
14
+ # @return [Hash{ Fixnum => Hash{ Fixnum => Fixnum }}] The nested hash of query peptide match byte offsets
15
+ attr_reader :psmidx
36
16
 
37
- # To create a peptides enumerable, you need to pass in the dat file handle and
38
- # the byte offset of the peptides section.
39
- def initialize(dat_file, byteoffset, cache_psm_index=true)
40
- @byteoffset = byteoffset
41
- @endbytepos = nil
42
-
43
- @file = dat_file
44
-
45
- @file.pos = @byteoffset
17
+ # @param dat [Mascot::DAT] Source DAT file
18
+ # @param section_label [Symbol] Section header, one of :peptides or :decoy_peptides
19
+ # @param cache_psm_index [Boolean] Whether to cache the PSM index
20
+ def initialize(dat, section_label, cache_psm_index=true)
21
+ # create our own filehandle, since other operations on a shared handle may move its cursor
22
+ @dat = Mascot::DAT.open(dat.dat_file.path)
23
+ @filehandle = @dat.dat_file
24
+ @section_label = section_label
25
+ self.rewind
46
26
  @curr_psm = [1,1]
47
- @psmidx = []
48
- @cache_psm_index = cache_psm_index
49
- index_psm_positions()
50
- end
51
-
52
- def index_psm_positions
53
- # create an in-memroy index of PSM byteoffsets
54
- q,p = 0
55
- @boundary_line = @file.readline
56
- @boundary = Regexp.new(@boundary_line)
57
- @file.each do |line|
58
- break if line =~ @boundary
59
- if @cache_psm_index
60
- line =~ /q(\d+)_p(\d+)/
61
- i,j = $1.to_i, $2.to_i
62
- next if q == i && p == j
63
- unless @psmidx[i].kind_of? Array
64
- q = i
65
- @psmidx[q] = []
66
- end
67
- @psmidx[i][j] = @file.pos - line.length
68
- q,p = i,j
69
- end
27
+ @psmidx = {}
28
+ @endbytepos = Float::INFINITY
29
+ if cache_psm_index
30
+ index_psm_positions()
70
31
  end
71
- @endbytepos = @file.pos - @boundary_line.length
72
- rewind
73
32
  end
74
33
 
34
+ # Rewind the cursor to the start of the peptides section (e.g. q1_p1=...)
75
35
  def rewind
76
- @file.pos = @byteoffset + @boundary_line.length
36
+ @dat.goto(@section_label)
37
+ 1.upto(2) { @filehandle.readline }
77
38
  end
78
39
 
79
40
  # Return a specific {Mascot::DAT::PSM} identified for query <code>q</code> and peptide number <code>p</code>
80
- # @param q Fixnum
81
- # @param p Fixnum
82
- # @return Mascot::DAT::PSM
83
- def psm q,p
84
- @file.pos = @psmidx[q][p]
85
- next_psm
41
+ # @param query_number [Fixnum]
42
+ # @param rank [Fixnum]
43
+ # @return [Mascot::DAT::PSM]
44
+ # @raise [Exception] if given an invalid q,p coordinate
45
+ # @example my_dat.peptides.psm(1,1) # => Mascot::DAT::PSM for query 1 peptide 1
46
+ def psm query_number,rank
47
+ if @psmidx[query_number] and @psmidx[query_number][rank]
48
+ @filehandle.pos = @psmidx[query_number][rank]
49
+ next_psm
50
+ else
51
+ raise Exception.new "Invalid PSM specification (#{q},#{p})"
52
+ end
86
53
  end
87
54
 
88
55
  # Returns the next {Mascot::DAT::PSM} from the DAT file. If there is no other PSM, then it returns nil.
89
- # @return Mascot::DAT::PSM
56
+ # @return [Mascot::DAT::PSM, NilClass]
90
57
  def next_psm
91
- return nil if @file.pos >= @endbytepos
58
+ if @filehandle.pos >= @endbytepos
59
+ return nil
60
+ end
92
61
  # get the initial values for query & rank
93
- tmp = []
94
- tmp << @file.readline.chomp
95
- k,v = tmp[0].split "="
96
- # skip when there are no peptides (value equals -1)
97
- return nil if v == "-1"
98
-
99
- tmp[0] =~ /q(\d+)_p(\d+)/
100
- q = $1
101
- p = $2
102
-
103
- tmp_pos = @file.pos
104
- @file.each do |l|
105
- break if l =~ @boundary
62
+ buffer = [@filehandle.readline.chomp]
63
+ buffer[0] =~ /q(\d+)_p(\d+)/
64
+ q,p = $1, $2
65
+ @curr_psm = [q,p]
66
+ prev_pos = @filehandle.pos
67
+ @filehandle.each do |l|
68
+ l.chomp!
69
+ # break if we have reached the boundary
70
+ if l =~ @boundary
71
+ @endbytepos = @filehandle.pos - @dat.boundary_string.length
72
+ break
73
+ end
74
+ # break if we are on another PSM
106
75
  break unless l =~ /^q#{q}_p#{p}/
107
- tmp << l.chomp
108
- tmp_pos = @file.pos
76
+ buffer << l
77
+ prev_pos = @filehandle.pos
109
78
  end
110
- @file.pos = tmp_pos
111
-
112
- Mascot::DAT::PSM.parse(tmp)
79
+ # rewind the cursor to the last hit
80
+ @filehandle.pos = prev_pos
81
+ # return the new PSM
82
+ Mascot::DAT::PSM.new(buffer)
113
83
  end
114
84
 
115
85
  # Iterate through all of the {Mascot::DAT::PSM} entries in the DAT file.
116
- # @return Enumerator
86
+ # @yield [Mascot::DAT::PSM]
117
87
  def each
118
- @file.pos = @byteoffset
119
- while @file.pos < @endbytepos
120
- psm = next_psm()
121
- next if psm.nil?
88
+ self.rewind
89
+ while psm = self.next_psm
122
90
  yield psm
123
91
  end
124
92
  end
93
+
94
+ private
95
+ # Index the byte offsets of the PSMs
96
+ # @private
97
+ def index_psm_positions
98
+ # create an in-memory index of PSM byte offsets
99
+ q,p = 0,0
100
+ # move the cursor past the boundary line
101
+ @filehandle.readline
102
+ @filehandle.each do |line|
103
+ break if line =~ @dat.boundary
104
+ line =~ /q(\d+)_p(\d+)/
105
+ qq,pp= $1.to_i, $2.to_i
106
+ next if q == qq && p == pp
107
+ q,p = qq,pp
108
+ unless @psmidx.has_key?(q)
109
+ @psmidx[q] = {}
110
+ end
111
+ @psmidx[q][p] = @filehandle.pos - line.length
112
+ end
113
+ @endbytepos = @filehandle.pos - @dat.boundary_string.length
114
+ self.rewind
115
+ end
125
116
  end
126
117
  end
127
118
  end
@@ -1,81 +1,97 @@
1
1
  module Mascot
2
2
  class DAT
3
+ # A single Peptide Spectrum Match (PSM) result. In Mascot parlance, this is a
4
+ # match from a query (e.g. a single MS2 spectrum from a MGF file) to a given peptide. A query may match more than one
5
+ # peptide at a given score, and Mascot will report these in order of descending significance, or "rank".
6
+ #
7
+ # From the Mascot documentation, the following represents a reasonably complete PSM entry
8
+ # q1_p1_db=01 # two-digit integer of the search DB index, zero-padded.
9
+ # q1_p1=missed cleavages, (-1 indicates no match)
10
+ # peptide Mr,
11
+ # delta,
12
+ # number of ions matched,
13
+ # peptide string,
14
+ # peaks used from Ions1,
15
+ # variable modifications string,
16
+ # ions score,
17
+ # ion series found,
18
+ # peaks used from Ions2,
19
+ # peaks used from Ions3;
20
+ # "accession string":frame number:start:end:multiplicity, # data for first protein
21
+ # "accession string":frame number:start:end:multiplicity, # data for second protein, etc.
22
+ # q1_p1_et_mods=modification mass,
23
+ # neutral loss mass,
24
+ # modification description
25
+ # q1_p1_primary_nl=neutral loss string
26
+ # q1_p1_drange=startPos:endPos
27
+ # q1_p1_terms=residue,residue:residue,residue # flanking AA for each protein, in order
28
+ #
3
29
  class PSM
4
30
 
5
- ATTRS = [:query,:rank,:missed_cleavages,:mr, :delta,
6
- :num_ions_matched,:pep,:ions1,:var_mods_str,:score,
7
- :ion_series_str,:ions2,:ions3,:proteins,:dbs,:terms,:attrs]
31
+ attr_accessor :query
32
+ attr_accessor :rank
33
+ attr_accessor :missed_cleavages
34
+ attr_accessor :mr
35
+ attr_accessor :delta
36
+ attr_accessor :num_ions_matched
37
+ attr_accessor :pep
38
+ attr_accessor :ions1
39
+ attr_accessor :var_mods_str
40
+ attr_accessor :score
41
+ attr_accessor :ion_series_str
42
+ attr_accessor :ions2
43
+ attr_accessor :ions3
44
+ attr_accessor :proteins
45
+ attr_accessor :dbs
46
+ attr_accessor :terms
47
+ attr_accessor :attrs
8
48
 
9
- ATTRS.each do |a|
10
- attr_accessor a
49
+ # @param psm_entry [Array] The multi-line string entry from the Mascot DAT file
50
+ # @return [Mascot::DAT::PSM]
51
+ def initialize(psm_entry)
52
+ parse_entry(psm_entry)
11
53
  end
12
54
 
13
- def initialize(*opts)
14
- @attrs = {}
15
-
16
- if opts.kind_of? Hash
17
- opts.keys.each do |k|
18
- if ATTRS.index(k.to_sym)
19
- eval "@#{k} = #{opts[k]}"
20
- end
21
- end
22
- end
23
- end
24
-
25
- def ==(other)
26
- is_eql = true
27
- ATTRS.each do |a|
28
- if self.send(a) != other.send(a)
29
- is_eql = false
30
- break
31
- end
32
- end
33
- is_eql
34
- end
35
- def self.parse psm_arr
36
- psm_result = self.new()
37
- psm_arr.each do |l|
38
- next unless l =~ /^q/
39
-
55
+ private
56
+ # Parses the query entry multi-line string from the Mascot DAT file
57
+ # @private
58
+ # @param psm_entry [Array]
59
+ # @return [Mascot::DAT::PSM]
60
+ def parse_entry psm_entry
61
+ psm_entry.each do |l|
40
62
  k,v = l.split "="
41
63
  case k
42
64
  when /^q(\d+)_p(\d+)$/
43
- psm_result.query = $1.to_i
44
- psm_result.rank = $2.to_i
65
+ @query = $1.to_i
66
+ @rank = $2.to_i
45
67
  psm_vals, prots = v.split(";")
46
68
  psm_vals = psm_vals.split(',')
47
- psm_result.missed_cleavages= psm_vals[0].to_i
48
- psm_result.mr = psm_vals[1].to_f
49
- psm_result.delta = psm_vals[2].to_f
50
- psm_result.num_ions_matched = psm_vals[3].to_i
51
- psm_result.pep = psm_vals[4]
52
- psm_result.ions1 = psm_vals[5].to_i
53
- psm_result.var_mods_str = psm_vals[6]
54
- psm_result.score = psm_vals[7].to_f
55
- psm_result.ion_series_str = psm_vals[8]
56
- psm_result.ions2 = psm_vals[9].to_i
57
- psm_result.ions3 = psm_vals[10].to_i
69
+ @missed_cleavages= psm_vals[0].to_i
70
+ @mr = psm_vals[1].to_f
71
+ @delta = psm_vals[2].to_f
72
+ @num_ions_matched = psm_vals[3].to_i
73
+ @pep = psm_vals[4]
74
+ @ions1 = psm_vals[5].to_i
75
+ @var_mods_str = psm_vals[6]
76
+ @score = psm_vals[7].to_f
77
+ @ion_series_str = psm_vals[8]
78
+ @ions2 = psm_vals[9].to_i
79
+ @ions3 = psm_vals[10].to_i
58
80
 
59
- # assign proteins
60
- psm_result.proteins = prots.split(",").map do |pe|
81
+ # assign proteins
82
+ @proteins = prots.split(",").map do |pe|
61
83
  acc,*other_vals = pe.split(":")
62
84
  acc.gsub!(/\"/,'')
63
85
  [acc] + other_vals.map {|e| e.to_i }
64
86
  end
65
87
  when /db$/
66
88
  # split on 2 chars, call to_i
67
- psm_result.dbs = v.split(/(\d{2})/).grep(/^\d+$/) { |e| e.to_i }
89
+ @dbs = v.split(/(\d{2})/).grep(/^\d+$/).collect { |e| e.to_i }
68
90
  when /terms$/
69
91
  # for each protein, I have to add the term AA
70
- psm_result.terms = v.split(":").collect {|t| t.split(",") }
71
- else
72
- # returns the smaller key
73
- puts "****#{k}***"
74
- k_sym = k.slice(/q\d+_p\d+_?(.+)/,1).to_sym
75
- psm_result.attrs[k_sym] = v
92
+ @terms = v.split(":").collect {|t| t.split(",") }
76
93
  end
77
94
  end
78
- psm_result
79
95
  end
80
96
  end
81
97
  end
@@ -77,7 +77,7 @@ module Mascot
77
77
  when "index"
78
78
  @index = v.to_i
79
79
  when "rtinseconds"
80
- @rtinseconds = v.to_f
80
+ @rtinseconds = v.to_i
81
81
  when "charge"
82
82
  @charge = v
83
83
  when "mass_min"
@@ -1,6 +1,6 @@
1
1
  module Mascot
2
2
  class DAT
3
- VERSION = "0.2.1"
3
+ VERSION = "0.3.0"
4
4
  end
5
5
  end
6
6
 
@@ -52,4 +52,25 @@ class TestMascotDatPeptides < TestMascotDatHelper
52
52
  assert_equal([2], q1p1_psm.dbs)
53
53
  assert_equal([["R","Y"]], q1p1_psm.terms)
54
54
  end
55
+
56
+ def test_second_psm_from_next_psm
57
+ q1p1_psm = @peptides.psm(1,1)
58
+ q1p2_psm = @peptides.next_psm()
59
+ assert_equal(1, q1p2_psm.query)
60
+ assert_equal(2, q1p2_psm.rank)
61
+ assert_equal(0, q1p2_psm.missed_cleavages)
62
+ assert_equal(476.223068, q1p2_psm.mr)
63
+ assert_equal(-0.940226, q1p2_psm.delta)
64
+ assert_equal(4, q1p2_psm.num_ions_matched)
65
+ assert_equal("GGESK", q1p2_psm.pep)
66
+ assert_equal(9, q1p2_psm.ions1)
67
+ assert_equal("0000000", q1p2_psm.var_mods_str)
68
+ assert_equal(13.29, q1p2_psm.score)
69
+ assert_equal("0000002020000000000", q1p2_psm.ion_series_str)
70
+ assert_equal(0, q1p2_psm.ions2)
71
+ assert_equal(0, q1p2_psm.ions3)
72
+ assert_equal([["P70298", 0, 605, 609, 1]], q1p2_psm.proteins)
73
+ assert_equal([2], q1p2_psm.dbs)
74
+ assert_equal([["K","N"]], q1p2_psm.terms)
75
+ end
55
76
  end
@@ -12,7 +12,7 @@ class TestMascotDatQuery < TestMascotDatHelper
12
12
  assert_equal("281.832701459371_513",@query.title)
13
13
  end
14
14
  def test_rtinseconds
15
- assert_equal(513.0, @query.rtinseconds)
15
+ assert_equal(513, @query.rtinseconds)
16
16
  end
17
17
  def test_index
18
18
  assert_equal(30,@query.index)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mascot-dat
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-07-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &70341777412840 !ruby/object:Gem::Requirement
16
+ requirement: &70273728840180 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70341777412840
24
+ version_requirements: *70273728840180
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: yard
27
- requirement: &70341777410100 !ruby/object:Gem::Requirement
27
+ requirement: &70273728839140 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,7 +32,7 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70341777410100
35
+ version_requirements: *70273728839140
36
36
  description: Mascot DAT file format parser
37
37
  email:
38
38
  - angel@upenn.edu
@@ -88,7 +88,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
88
88
  version: '0'
89
89
  segments:
90
90
  - 0
91
- hash: 4478337334591189647
91
+ hash: 3863437592712259051
92
92
  required_rubygems_version: !ruby/object:Gem::Requirement
93
93
  none: false
94
94
  requirements:
@@ -97,7 +97,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
97
  version: '0'
98
98
  segments:
99
99
  - 0
100
- hash: 4478337334591189647
100
+ hash: 3863437592712259051
101
101
  requirements: []
102
102
  rubyforge_project:
103
103
  rubygems_version: 1.8.11