mspire-mascot-dat 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ Copyright (c) 2013 Brigham Young University
2
+ Authored by: John T. Prince
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # mspire-mascot-dat
2
+
3
+ Reads the mascot search engine .dat results file.
4
+
5
+ ## Examples
6
+
7
+ ```ruby
8
+ require 'mspire-mascot-dat'
9
+
10
+ Mspire::Mascot::Dat.open("file.dat") do |dat|
11
+ dat.sections # => [:parameters, :masses, :unimod, :enzyme ...]
12
+ end
13
+
14
+ ```
15
+ ### each peptide
16
+
17
+ #### every peptide hit
18
+
19
+ ```ruby
20
+ dat.each_peptide do |pephit|
21
+ pephit.missed_cleavages # => an Integer
22
+ pephit.ions_score # => a Float
23
+ ...
24
+ end
25
+ ```
26
+
27
+ #### every decoy peptide hit
28
+
29
+ ```ruby
30
+ dat.each_peptide(false) {|pephit| ... }
31
+ ```
32
+
33
+ #### each top peptide hit
34
+
35
+ ```ruby
36
+ dat.each_peptide(true, 1) {|pephit| ... }
37
+ ```
38
+
39
+ ## Further Info
40
+
41
+ See Mascot's 'Installation & Setup Manual' for detailed information about the
42
+ .dat format itself.
43
+
44
+ ## Copyright
45
+
46
+ MIT. See LICENSE.txt
data/Rakefile ADDED
@@ -0,0 +1,39 @@
1
+ # encoding: utf-8
2
+
3
require 'rubygems'
require 'rake'

# Gem packaging tasks (gemspec generation, build, release) come from jeweler.
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "mspire-mascot-dat"
  gem.homepage = "http://github.com/princelab/mspire-mascot-dat"
  gem.license = "MIT"
  gem.summary = %Q{Reads mascot dat files for mspire library.}
  gem.description = %Q{Reads mascot dat files with gusto for mspire library.}
  gem.email = "jtprince@gmail.com"
  gem.authors = ["John T. Prince"]
  gem.add_dependency "elif", "~> 0.1.0"
  gem.add_development_dependency "rspec", "~> 2.8.0"
  gem.add_development_dependency "rdoc", "~> 3.12"
  gem.add_development_dependency "jeweler", "~> 1.8.4"
end
Jeweler::RubygemsDotOrgTasks.new

# `rake spec` runs every *_spec.rb file under spec/.
require 'rspec/core'
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

task :default => :spec

# `rake rdoc` builds the API documentation into ./rdoc.
require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
  # Strip the trailing newline of the VERSION file so that it does not end up
  # inside the documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "mspire-mascot-dat #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.1
@@ -0,0 +1,93 @@
1
+ require 'elif'
2
+
3
+ module Mspire
4
+ module Mascot
5
+ class Dat
6
# Builds a byte index (not a line index) of a Mascot .dat file: for every
# section header found in the io it records the byte offset at which that
# section's data starts, so a reader can seek straight to it.
class Index

  # Matches a section header line and captures the section name.
  SECTION_HEADER = %r{^Content-Type: application/x-Mascot; name=["'](\w+)["']}

  # Matches a section name that denotes a query and captures its number.
  QUERY_NAME = /\Aquery(\d+)\z/

  # the hash holding the start byte for each section (besides the
  # queries), keyed by section name (a String)
  attr_accessor :byte_num

  # the array holding the start byte for each query. It is indexed by
  # query number, so slots without a corresponding query are nil.
  attr_accessor :query_num_to_byte

  # an array of the query nums, in the order they were found
  attr_accessor :query_nums

  # Creates an empty index, or a populated one when an io is given.
  def initialize(io=nil)
    @byte_num = {}
    @query_num_to_byte = []
    @query_nums = []
    from_io(io) if io
  end

  # true if at least one query section was indexed
  def has_queries?
    @query_nums.size > 0
  end

  # Scans the whole io for section headers and records the byte offset of
  # the data that follows each one (the position after the header line and
  # the single line that follows it). The io is rewound before and after
  # the scan. The resulting containers are frozen. Any previously indexed
  # content is replaced, so the method may be called more than once.
  #
  # returns self
  def from_io(io)
    byte_num = {}
    query_num_to_byte = []
    query_nums = []

    io.rewind
    # A single pass over every line, including the very first one.
    while (line = io.gets)
      next unless (md = SECTION_HEADER.match(line))
      head = md[1]
      io.gets # the blank line between the header and the data
      pos = io.pos

      if (qmd = QUERY_NAME.match(head))
        query_num = qmd[1].to_i
        query_nums << query_num
        query_num_to_byte[query_num] = pos
      else
        byte_num[head] = pos
      end
    end
    io.rewind

    @query_nums = query_nums.freeze
    @query_num_to_byte = query_num_to_byte.freeze
    @byte_num = byte_num.freeze
    self
  end

  # given a string or symbol, looks up the start byte of that section.
  # Given an Integer, assumes it is a query num and returns the start byte
  # of the query number. Returns nil when nothing is indexed under key.
  def [](key)
    if key.is_a?(Integer)
      @query_num_to_byte[key]
    else
      @byte_num[key.to_s]
    end
  end

  # start byte of query n; nil if the query is out of bounds
  def query(n)
    @query_num_to_byte[n]
  end

end
76
+ end
77
+ end
78
+ end
79
+
80
+ #--gc0p4Jq0M2Yt08jU534c0p
81
+ #Content-Type: application/x-Mascot; name="index"
82
+
83
+ #parameters=4
84
+ #masses=78
85
+ #unimod=119
86
+ #enzyme=484
87
+ #header=492
88
+ #summary=507
89
+ #peptides=820
90
+ #proteins=853
91
+ #query1=858
92
+ #query2=871
93
+ #--gc0p4Jq0M2Yt08jU534c0p--
@@ -0,0 +1,95 @@
1
+
2
+ module Mspire
3
+ module Mascot
4
+ class Dat
5
# A single peptide hit read from a peptides section. The first eleven
# members mirror the comma separated values of a "qN_pM=..." line, in
# order; mr is the relative molecular mass. query_num and peptide_num come
# from the "qN_pM" key. proteins and data are not populated yet.
Peptide = Struct.new(:missed_cleavages, :mr, :delta, :num_ions_matched, :seq, :peaks_from_ions_1, :var_mods_string, :ions_score, :ion_series_found, :peaks_from_ions_2, :peaks_from_ions_3, :query_num, :peptide_num, :proteins, :data) do
  # How to convert each of the comma separated values, by position.
  # (A constant assigned inside a block lands in the lexically enclosing
  # namespace, not in the struct class itself.)
  CAST = [:int, :float, :float, :int, :string, :int, :string, :float, :string, :int, :int].freeze

  # Splits a key into its leading numbers and the remaining suffix:
  # h49_q2 => [49, 2, nil]; q2_p4_primary_nl => [2, 4, 'primary_nl']
  # A number is nil if the key has no such part.
  def self.qnum_pnum(string)
    (qns, pns, other) = string.split('_', 3)
    [ *[qns, pns].map {|ns| ns && ns[1..-1].to_i }, other ]
  end

  # takes an io object (positioned at the beginning of a peptide hit)
  # and reads off the next peptide hit "q1_p1=0,798.23...". Returns nil
  # if it reaches the end of the section, the end of the io, or a blank
  # line.
  #
  # After the hit line itself, the follow-up lines that belong to the same
  # hit (e.g. "q1_p1_terms=...") are consumed but not stored, so the io is
  # left at the start of the next hit (or at the line ending the section).
  #
  # Raises NotImplementedError if proteins or data is requested.
  def self.from_io(io, proteins=false, data=false)
    raise NotImplementedError, "not reading proteins or data yet" if proteins || data

    # nil (end of io), a blank line, or a boundary line ends the section
    finished = ->(line) { line.nil? || line.size < 2 || line[0,2] == '--' }

    line = io.gets("\n")
    return nil if finished[line]

    # Split only on the first '=' and the first ';' so that protein info
    # containing either character cannot corrupt the core values.
    (qp, rest) = line.chomp.split('=', 2)
    (qnum, pnum, _) = qnum_pnum(qp)
    core = rest.to_s.split(';', 2).first.to_s
    fields = core.split(',')

    # Always produce exactly CAST.size values (nil where the record is
    # short, as in "q1_p1=-1") so that query_num and peptide_num land in
    # their own members.
    vals = CAST.each_with_index.map do |cast, i|
      val = fields[i]
      case cast
      when :int then val && val.to_i
      when :float then val && val.to_f
      else
        val
      end
    end
    pephit = new(*vals, qnum, pnum)

    # Skip the extra lines that belong to this hit, stopping (and rewinding
    # one line) at the first line that belongs to something else.
    loop do
      before = io.pos
      line = io.gets("\n")
      if finished[line]
        io.pos = before
        break
      end
      (next_qnum, next_pnum, _) = qnum_pnum(line.split('=', 2).first)
      if pephit.peptide_num != next_pnum || pephit.query_num != next_qnum
        io.pos = before
        break
      end
    end
    pephit
  end
end
53
+ end
54
+ end
55
+ end
56
+
57
+ =begin
58
+ q1_p1=missed cleavages, (–1 indicates no match)
59
+ peptide Mr,
60
+ delta,
61
+ number of ions matched,
62
+ peptide string,
63
+ peaks used from Ions1,
64
+ variable modifications string,
65
+ ions score,
66
+ ion series found,
67
+ peaks used from Ions2,
68
+ peaks used from Ions3;
69
+ “accession string”: data for first protein
70
+ frame number:
71
+ start:
72
+ end:
73
+ multiplicity,
74
+ “accession string”: data for second protein
75
+ frame number:
76
+ start:
77
+ end:
78
+ multiplicity,
79
+ etc.
80
+ q1_p1_et_mods=modification mass,
81
+ neutral loss mass,
82
+ modification description
83
+ q1_p1_et_mods_master=neutral loss mass[[,neutral loss mass] ... ]
84
+ q1_p1_et_mods_slave=neutral loss mass[[,neutral loss mass] ... ]
85
+ q1_p1_primary_nl=neutral loss string
86
+ q1_p1_na_diff=original NA sequence,
87
+ modified NA sequence
88
+ q1_p1_tag=tagNum:startPos:endPos:seriesID,...
89
+ q1_p1_drange=startPos:endPos
90
+ q1_p1_terms=residue,residue[[:residue,residue] ... ]
91
+ q1_p1_subst=pos1,ambig1,matched1 ... ,posn,ambign,matchedn
92
+ q1_p1_comp=quantitation component name
93
+ q1_p2=...
94
+ =end
95
+
@@ -0,0 +1,69 @@
1
+ require 'ostruct'
2
+ require 'delegate'
3
+ require 'cgi'
4
+
5
+ module Mspire
6
+ module Mascot
7
+ class Dat
8
# The key/value pairs of one query section, stored as a Hash with Symbol
# keys. Values whose key is listed in CAST are converted; all others stay
# Strings. Existing keys can also be read and written as methods
# (query.title, query.charge = 2).
class Query < Hash

  # Conversion applied to the raw String value of each known key: either a
  # lambda or the name of a String method.
  CAST = {
    # "2+" => 2, "2-" => -2 (move the trailing sign to the front)
    charge: ->(st) { "#{st[-1]}#{st[0...-1]}".to_i },
    title: ->(st) { CGI.unescape(st) },
    mass_min: :to_f,
    mass_max: :to_f,
    int_min: :to_f,
    int_max: :to_f,
    num_vals: :to_i,
    num_used1: :to_i,
    index: :to_i,
    # "m1:i1,m2:i2" => [[m1, i1], [m2, i2]] as floats
    Ions1: ->(st) { st.split(',').map {|pair_s| pair_s.split(':').map(&:to_f) } },
  }.freeze

  # Reads "key=value" lines from io until a line starting with '--' (or the
  # end of the io) and returns a new Query holding them. Blank lines are
  # skipped, and only the first '=' separates key from value, so values may
  # themselves contain '='. A line without '=' is stored with a nil value.
  def self.from_io(io)
    query = new
    while (line = io.gets)
      break if line[0,2] == '--'
      line.chomp!
      next if line.empty?
      (key, val) = line.split('=', 2)
      key = key.to_sym
      cast = CAST[key]
      # to_proc turns a Symbol into a callable and leaves a lambda as it is
      query[key] = (cast && val) ? cast.to_proc[val] : val
    end
    query
  end

  # Gives method-style access to existing keys: `query.foo` returns
  # self[:foo] and `query.foo = x` assigns self[:foo]. Anything that is not
  # an existing key falls through to the default behavior (NoMethodError).
  def method_missing(name, *args, &block)
    name_s = name.to_s
    if name_s[-1] == '='
      key = name_s[0...-1].to_sym
      return self[key] = args.first if key?(key)
    elsif key?(name)
      return self[name]
    end
    super
  end

  # Keeps respond_to? consistent with method_missing.
  def respond_to_missing?(name, include_private = false)
    name_s = name.to_s
    key = name_s[-1] == '=' ? name_s[0...-1].to_sym : name
    key?(key) || super
  end
end
57
+ end
58
+ end
59
+ end
60
+
61
+ #index=1
62
+ #charge=2+
63
+ #mass_min=305.484440
64
+ #mass_max=1998.945430
65
+ #int_min=1.5
66
+ #int_max=747
67
+ #num_vals=3
68
+ #num_used1=-1
69
+
@@ -0,0 +1,59 @@
1
+ require 'mspire/mascot/dat/index'
2
+ require 'mspire/mascot/dat/peptide'
3
+ require 'mspire/mascot/dat/query'
4
+
5
+ module Mspire
6
+ module Mascot
7
# Reader for a Mascot .dat results file. It wraps an open io together with
# an Index of section start bytes and uses the index to seek to sections.
class Dat
  # the io object which is the open dat file
  attr_accessor :io

  # the index object which points to the start byte for each section
  attr_accessor :index

  # Wraps an already open io and indexes it right away.
  def initialize(io)
    @io = io
    @index = Index.new(@io)
  end

  # Opens file, yields a Dat built on it and returns the value of the
  # block. The file is closed when the block finishes, even if it raises.
  # Raises ArgumentError if no block is given.
  def self.open(file, &block)
    raise ArgumentError, "a block is required" unless block
    File.open(file) {|io| block.call(self.new(io)) }
  end

  # positions io at the beginning of the section data (past the Content
  # type and blank line). If given an integer, interprets it as a query
  # number. Raises KeyError if the index has no such section or query.
  # returns self
  def start_section!(name)
    pos = @index[name]
    raise KeyError, "no such section or query in dat file: #{name.inspect}" if pos.nil?
    @io.pos = pos
    self
  end

  # Seeks to query number n and returns it parsed as a Query.
  def query(n)
    start_section!(n)
    Query.from_io(@io)
  end

  # Yields each peptide hit of the :peptides section (or of the
  # :decoy_peptides section if non_decoy is false) whose peptide_num is at
  # most top_n. Returns an enumerator if no block is given.
  def each_peptide(non_decoy=true, top_n=Float::INFINITY, &block)
    block or return enum_for(__method__, non_decoy, top_n)
    start_section!(non_decoy ? :peptides : :decoy_peptides)
    while peptide = Peptide.from_io(@io)
      block.call(peptide) if peptide.peptide_num <= top_n
    end
  end

  # returns a list of all sections as symbols. The symbol :queries is
  # returned rather than each query individually if there is 1 or more
  # queries.
  def sections
    reply = @index.byte_num.keys
    if @index.has_queries?
      reply.push('queries')
    end
    reply.map(&:to_sym)
  end

end
58
+ end
59
+ end
@@ -0,0 +1,70 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = "mspire-mascot-dat"
8
+ s.version = "0.0.1"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["John T. Prince"]
12
+ s.date = "2013-03-28"
13
+ s.description = "Reads mascot dat files with gusto for mspire library."
14
+ s.email = "jtprince@gmail.com"
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.md"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".rspec",
22
+ "LICENSE.txt",
23
+ "README.md",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "lib/mspire/mascot/dat.rb",
27
+ "lib/mspire/mascot/dat/index.rb",
28
+ "lib/mspire/mascot/dat/peptide.rb",
29
+ "lib/mspire/mascot/dat/query.rb",
30
+ "mspire-mascot-dat.gemspec",
31
+ "spec/mspire/mascot/dat/index_spec.rb",
32
+ "spec/mspire/mascot/dat/peptide_spec.rb",
33
+ "spec/mspire/mascot/dat/query_spec.rb",
34
+ "spec/mspire/mascot/dat_spec.rb",
35
+ "spec/reference/dat_format_reference.md",
36
+ "spec/reference/two_spectra_decoy_F004129.png",
37
+ "spec/reference/two_spectra_no_decoy_F004128.png",
38
+ "spec/spec_helper.rb",
39
+ "spec/testfiles/F004128.dat",
40
+ "spec/testfiles/F004129.dat",
41
+ "spec/testfiles/two_spectra.mgf"
42
+ ]
43
+ s.homepage = "http://github.com/princelab/mspire-mascot-dat"
44
+ s.licenses = ["MIT"]
45
+ s.require_paths = ["lib"]
46
+ s.rubygems_version = "1.8.23"
47
+ s.summary = "Reads mascot dat files for mspire library."
48
+
49
+ if s.respond_to? :specification_version then
50
+ s.specification_version = 3
51
+
52
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
53
+ s.add_runtime_dependency(%q<elif>, ["~> 0.1.0"])
54
+ s.add_development_dependency(%q<rspec>, ["~> 2.8.0"])
55
+ s.add_development_dependency(%q<rdoc>, ["~> 3.12"])
56
+ s.add_development_dependency(%q<jeweler>, ["~> 1.8.4"])
57
+ else
58
+ s.add_dependency(%q<elif>, ["~> 0.1.0"])
59
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
60
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
61
+ s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
62
+ end
63
+ else
64
+ s.add_dependency(%q<elif>, ["~> 0.1.0"])
65
+ s.add_dependency(%q<rspec>, ["~> 2.8.0"])
66
+ s.add_dependency(%q<rdoc>, ["~> 3.12"])
67
+ s.add_dependency(%q<jeweler>, ["~> 1.8.4"])
68
+ end
69
+ end
70
+
@@ -0,0 +1,44 @@
1
+ require 'spec_helper'
2
+
3
+ require 'mspire/mascot/dat/index'
4
+
5
# Checks that Index finds the start byte of every section and query of the
# F004128.dat test file.
describe 'Mspire::Mascot::Dat::Index being initialized from file' do

  let(:io) { File.open(TESTFILES + "/F004128.dat") }

  # do not leak one open file handle per example
  after { io.close unless io.closed? }

  specify '#initialize(io) creates the index object' do
    Mspire::Mascot::Dat::Index.new(io).should be_a(Mspire::Mascot::Dat::Index)
  end

  describe Mspire::Mascot::Dat::Index do
    subject { Mspire::Mascot::Dat::Index.new(io) }

    it 'can access the header start byte nums' do

      {
        :parameters => 196,
        :masses => 1203,
        :unimod => 1873,
        :enzyme => 20540,
        :header => 20661,
        :summary => 21103,
        :peptides => 41624,
        :proteins => 43894,
      }.each {|head,val| subject[head].should == val }

    end

    it 'can get the queries' do
      # the expected values are start bytes, one per query number
      [1,2].zip([44129, 51186]) do |num, start_byte|
        subject.query(num).should == start_byte
      end
    end

    it 'can be accessed with brackets like array or hash' do
      subject[1].should == 44129
      subject[:peptides].should == 41624
      subject['peptides'].should == 41624
    end

  end
end
@@ -0,0 +1,24 @@
1
+ require 'spec_helper'
2
+
3
+ require 'mspire/mascot/dat/peptide'
4
+
5
# Reads the first peptide hit off a small in-memory peptides excerpt and
# checks every member of the resulting struct.
describe 'reading off a Peptide' do

  before(:all) do
    @io = StringIO.new(<<STRING)
q2_p1=0,2978.269196,1.195840,5,MDSSSGSQGNGSFMDQNSLGILNMDNLK,17,000000000000001000000000100000,4.11,0002000020000000000,0,0;"Q9VV79":0:1:28:1
q2_p1_terms=-,V
q2_p1_primary_nl=000000000000002000000000200000
q2_p2=1,2979.449478,0.015558,5,STGAESSEEXLREAYIMASVEHVNLLK,45,00000000000000000100000000000,2.84,0000000020000000000,0,0;"Q6SAG3":0:875:901:1
STRING
  end

  it 'works' do
    expected = {
      missed_cleavages: 0,
      mr: 2978.269196,
      delta: 1.19584,
      num_ions_matched: 5,
      seq: 'MDSSSGSQGNGSFMDQNSLGILNMDNLK',
      peaks_from_ions_1: 17,
      var_mods_string: '000000000000001000000000100000',
      ions_score: 4.11,
      ion_series_found: '0002000020000000000',
      peaks_from_ions_2: 0,
      peaks_from_ions_3: 0,
      query_num: 2,
      peptide_num: 1,
      proteins: nil,
      data: nil,
    }
    peptide = Mspire::Mascot::Dat::Peptide.from_io(@io)
    expected.each_pair do |member, value|
      peptide.send(member).should == value
    end
  end

end
@@ -0,0 +1,33 @@
1
+ require 'spec_helper'
2
+
3
+ require 'mspire/mascot/dat/query'
4
+
5
# Parses a shortened query section from memory and checks the values that
# need non-trivial conversion (the url-encoded title, the signed charge).
describe 'creating a query object' do

  before(:all) do
    # this is hacked up to be much smaller, so don't look for consistency
    # between the data and the spectrum
    @io = StringIO.new(<<END)
title=1%2e2746%2e2746%2e2
index=1
charge=2-
mass_min=305.484440
mass_max=1998.945430
int_min=1.5
int_max=747
num_vals=3
num_used1=-1
Ions1=371.460240:10.3,486.498990:15.7,538.381160:24.9
--gc0p4Jq0M2Yt08jU534c0p
Content-Type: application/x-Mascot; name="query2"
END
  end

  it 'parses' do
    parsed = Mspire::Mascot::Dat::Query.from_io(@io)
    parsed.title.should == '1.2746.2746.2'
    parsed.charge.should == -2
  end
end
33
+