ms-ident 0.0.18 → 0.0.19

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -8,7 +8,7 @@ Jeweler::Tasks.new do |gem|
8
8
  gem.homepage = "http://github.com/jtprince/ms-ident"
9
9
  gem.license = "MIT"
10
10
  gem.summary = %Q{mspire library for working with mzIdentML and pepxml}
11
- gem.description = %Q{mspire library for working with mzIdentML and pepxml}
11
+ gem.description = %Q{mspire library for working with mzIdentML, pepxml, and related.}
12
12
  gem.email = "jtprince@gmail.com"
13
13
  gem.authors = ["John T. Prince"]
14
14
  gem.rubyforge_project = 'mspire'
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.18
1
+ 0.0.19
@@ -5,7 +5,11 @@ module Ms ; end
5
5
  module Ms::Ident ; end
6
6
  module Ms::Ident::Peptide ; end
7
7
 
8
- module Ms::Ident::Peptide::Db
8
+ # the object itself is a modified Hash.
9
+ # It is initialized with the database file and a protein array can be
10
+ # retrieved with the #[] method given an amino acid sequence. All other
11
+ # methods are untested at this time and should be avoided!
12
+ class Ms::Ident::Peptide::Db < Hash
9
13
  MAX_NUM_AA_EXPANSION = 3
10
14
 
11
15
  # the twenty standard amino acids
@@ -165,12 +169,24 @@ module Ms::Ident::Peptide::Db
165
169
  to_expand
166
170
  end
167
171
 
172
+ def initialize(db_file)
173
+ self.replace(YAML.load_file(db_file))
174
+ end
175
+
176
+ alias_method :old_bracket, '[]'.to_sym
177
+
178
+ # returns the protein ids as an array
179
+ def [](key)
180
+ old_bracket(key).chomp.split(PROTEIN_DELIMITER)
181
+ end
182
+
168
183
  # an object for on disk retrieval of db entries
169
184
  # proteins are returned as an array.
170
185
  # behaves much like a hash once it is opened.
171
186
  class IO
172
187
  include Enumerable
173
188
  def self.open(filename, &block)
189
+ raise ArgumentError unless block
174
190
  File.open(filename) do |io|
175
191
  block.call(self.new(io))
176
192
  end
@@ -192,9 +208,11 @@ module Ms::Ident::Peptide::Db
192
208
  @index[key] = [start, end_pos-start]
193
209
  end
194
210
  end
211
+
195
212
  # returns an array of proteins for the given key (peptide aaseq)
196
213
  def [](key)
197
214
  (start, length) = @index[key]
215
+ return nil unless start
198
216
  @io.seek(start)
199
217
  string = @io.read(length)
200
218
  string.chomp!
@@ -0,0 +1,55 @@
1
+ require 'ms/ident/peptide_hit'
2
+
3
+ module Ms ; end
4
+ module Ms::Ident ; end
5
+
6
+ class Ms::Ident::PeptideHit
7
+ module Qvalue
8
+ attr_accessor :qvalue
9
+ FILE_EXTENSION = '.phq.tsv'
10
+ FILE_DELIMITER = "\t"
11
+ HEADER = %w(aaseq charge qvalue)
12
+
13
+ class << self
14
+
15
+ # writes to the file, adding an extension
16
+ def to_phq(base, hits, qvalues=nil)
17
+ to_file(base + FILE_EXTENSION, hits)
18
+ end
19
+
20
+ # writes the peptide hits to a phq.tsv file. qvalues is a parallel array
21
+ # to hits that can provide qvalues if not inherent to the hits
22
+ # returns the filename.
23
+ def to_file(filename, hits, qvalues=[])
24
+ File.open(filename,'w') do |out|
25
+ out.puts HEADER.join(FILE_DELIMITER)
26
+ hits.zip(qvalues) do |hit, qvalue|
27
+ out.puts [hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
28
+ end
29
+ end
30
+ filename
31
+ end
32
+
33
+ # returns an array of PeptideHit objects from a phq.tsv
34
+ def from_file(filename)
35
+ peptide_hits = []
36
+ File.open(filename) do |io|
37
+ header = io.readline.chomp.split(FILE_DELIMITER)
38
+ raise "bad headers" unless header == HEADER
39
+ io.each do |line|
40
+ line.chomp!
41
+ (aaseq, charge, qvalue) = line.split(FILE_DELIMITER)
42
+ ph = Ms::Ident::PeptideHit.new
43
+ ph.aaseq = aaseq ; ph.charge = charge.to_i ; ph.qvalue = qvalue.to_f
44
+ peptide_hits << ph
45
+ end
46
+ end
47
+ peptide_hits
48
+ end
49
+
50
+ alias_method :from_phq, :from_file
51
+
52
+ end
53
+ end # Qvalue
54
+ include Qvalue
55
+ end # Peptide Hit
@@ -0,0 +1,8 @@
1
+ module Ms ; end
2
+ module Ms::Ident ; end
3
+
4
+ class Ms::Ident::PeptideHit
5
+ attr_accessor :aaseq
6
+ attr_accessor :charge
7
+ attr_accessor :proteins
8
+ end
@@ -1,70 +1,12 @@
1
1
  module Ms ; end
2
2
  module Ms::Ident ; end
3
3
 
4
- require 'set'
5
4
 
6
5
  module Ms::Ident::Protein
7
-
8
- class << self
9
- end
10
-
11
6
  # gives the information up until the first space or carriage return.
12
7
  # Assumes the protein can respond_to? :reference
13
8
  def first_entry
14
9
  reference.split(/[\s\r]/)[0]
15
10
  end
16
-
17
- PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
18
- peptide_hits = protein_group_and_peptide_hits.last
19
- num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
20
- num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
21
- [num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
22
- end
23
-
24
-
25
- module_function
26
- # greedy algorithm to map a set of peptide_hits to protein groups. each
27
- # peptide hit should respond to :aaseq, :charge, :proteins if a block is
28
- # given, yields a single argument: a doublet of protein_group and peptide
29
- # set. It expects a metric or array to sort by for creating greedy protein
30
- # groups (the greediest proteins should sort to the back of the array). if
31
- # no block is given, the groups are sorted by [# uniq aaseqs, # uniq
32
- # aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS). Sets of
33
- # peptide_hits and the objects returned by peptide_hit#proteins are used as
34
- # hash keys. As long as each peptide hit has a unique signature (like an
35
- # id) then any object will work. If they are Struct objects, you might
36
- # consider redefining the #hash method to be object_id for performance and
37
- # accuracy.
38
- def peptide_hits_to_protein_groups(peptide_hits, &sort_by)
39
- sort_by ||= PRIORITIZE_PROTEINS
40
- # note to self: I wrote this in 2011, so I think I know what I'm doing now
41
- protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
42
- peptide_hits.each do |peptide_hit|
43
- peptide_hit.proteins.each do |protein|
44
- protein_to_peptides[protein] << peptide_hit
45
- end
46
- end
47
- peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
48
- protein_to_peptides.each do |protein, peptide_set|
49
- peptides_to_protein_group[peptide_set] << protein
50
- end
51
- protein_group_to_peptides = peptides_to_protein_group.invert
52
- greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
53
- accounted_for = Set.new
54
- surviving_protein_groups = []
55
- # we are discarding the subsumed sets, but we could get them with
56
- # partition
57
- greedy_first.select do |group, peptide_set|
58
- has_an_unaccounted_peptide = false
59
- peptide_set.each do |peptide_hit|
60
- unless accounted_for.include?(peptide_hit)
61
- has_an_unaccounted_peptide = true
62
- accounted_for.add(peptide_hit)
63
- end
64
- end
65
- has_an_unaccounted_peptide
66
- end
67
- end
68
-
69
11
  end
70
12
 
@@ -0,0 +1,72 @@
1
+ require 'set'
2
+
3
+ module Ms
4
+ module Ident
5
+ # represents a group of proteins, typically indistinguishable in the
6
+ # experiment.
7
+ class ProteinGroup < Array
8
+ attr_accessor :peptide_hits
9
+
10
+ PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
11
+ peptide_hits = protein_group_and_peptide_hits.last
12
+ num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
13
+ num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
14
+ [num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
15
+ end
16
+
17
+ # greedy algorithm to map a set of peptide_hits to protein groups. each
18
+ # peptide hit should respond to :aaseq, :charge, :proteins. If a block is
19
+ # given, yields a single argument: a doublet of protein_group and peptide
20
+ # set. It expects a metric or array to sort by for creating greedy protein
21
+ # groups (the greediest proteins should sort to the back of the array). if
22
+ # no block is given, the groups are sorted by [# uniq aaseqs, # uniq
23
+ # aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS). Sets of
24
+ # peptide_hits and the objects returned by peptide_hit#proteins are used as
25
+ # hash keys. As long as each peptide hit has a unique signature (like an
26
+ # id) then any object will work. If they are Struct objects, you might
27
+ # consider redefining the #hash method to be object_id for performance and
28
+ # accuracy.
29
+ #
30
+ # returns an array of ProteinGroup objects, each set with :peptide_hits
31
+ def self.peptide_hits_to_protein_groups(peptide_hits, &sort_by)
32
+ sort_by ||= PRIORITIZE_PROTEINS
33
+ # note to self: I wrote this in 2011, so I think I know what I'm doing now
34
+ protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
35
+ peptide_hits.each do |peptide_hit|
36
+ peptide_hit.proteins.each do |protein|
37
+ protein_to_peptides[protein] << peptide_hit
38
+ end
39
+ end
40
+ peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
41
+ protein_to_peptides.each do |protein, peptide_set|
42
+ peptides_to_protein_group[peptide_set] << protein
43
+ end
44
+ peptides_to_protein_group.each do |pephits,ar_of_prots|
45
+ pg = Ms::Ident::ProteinGroup.new(ar_of_prots)
46
+ pg.peptide_hits = pephits
47
+ peptides_to_protein_group[pephits] = pg
48
+ end
49
+
50
+ protein_group_to_peptides = peptides_to_protein_group.invert
51
+ greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
52
+
53
+ accounted_for = Set.new
54
+ # we are discarding the subsumed sets, but we could get them with
55
+ # partition
56
+ greedy_first.select! do |group, peptide_set|
57
+ has_an_unaccounted_peptide = false
58
+ peptide_set.each do |peptide_hit|
59
+ unless accounted_for.include?(peptide_hit)
60
+ has_an_unaccounted_peptide = true
61
+ accounted_for.add(peptide_hit)
62
+ end
63
+ end
64
+ group.peptide_hits = peptide_set if has_an_unaccounted_peptide
65
+ has_an_unaccounted_peptide
66
+ end
67
+ greedy_first.map(&:first)
68
+ end
69
+
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,17 @@
1
+
2
+ module Ms ; end
3
+ module Ms::Ident ; end
4
+
5
+ class Ms::Ident::ProteinHit
6
+ attr_accessor :id
7
+ attr_accessor :seq
8
+ alias_method :sequence, :seq
9
+ alias_method :sequence=, :seq=
10
+ attr_accessor :peptide_hits
11
+
12
+ def initialize(id=nil)
13
+ @peptide_hits = []
14
+ @id = id
15
+ end
16
+ end
17
+
@@ -82,6 +82,12 @@ describe 'reading a peptide centric database' do
82
82
  outfiles = Ms::Ident::Peptide::Db.cmdline([FASTA_FILE])
83
83
  @outfile = outfiles.first
84
84
 
85
+ it 'creates a hash that can retrieve peptides as an array' do
86
+ hash = Ms::Ident::Peptide::Db.new(@outfile)
87
+ hash["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
88
+ hash["VRAAR"].enums ["tr|D3DX18|D3DX18_HUMAN"]
89
+ end
90
+
85
91
  it 'reads the file on disk with random access or is enumerable' do
86
92
  Ms::Ident::Peptide::Db::IO.open(@outfile) do |io|
87
93
  io["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
@@ -1,6 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
- require 'ms/ident/protein'
3
+ require 'ms/ident/protein_group'
4
4
 
5
5
  PeptideHit = Struct.new(:aaseq, :charge, :proteins) do
6
6
  def inspect # easier to read output
@@ -36,22 +36,21 @@ describe 'creating minimal protein groups from peptide hits' do
36
36
  it 'is a greedy algorithm' do
37
37
  @prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
38
38
  # big_guy has all the peptides, so it takes them all
39
- reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
40
- reply.first.size.is 2 # the group and the peptide set
41
- reply.first.first.size.is 1 # the group
42
- reply.first.first.first.id.is 'big_guy'
39
+ protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
40
+ protein_groups.first.size.is 1# the group
41
+ protein_groups.first.first.id.is 'big_guy'
43
42
  end
44
43
 
45
44
  it 'removes proteins accounted for only as little pieces of larger proteins' do
46
45
  @prot_hits[1..-1].each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
47
- reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
46
+ protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
48
47
  # no subsumed_by_medium
49
- reply.map(&:first).any? {|protein_list| protein_list.any? {|v| v.id == 'subsumed_by_medium' }}.is false
48
+ protein_groups.any? {|prot_group| prot_group.any? {|v| v.id == 'subsumed_by_medium' }}.is false
50
49
  end
51
50
 
52
51
  it 'allows alternate sorting algorithms for greediness' do
53
52
  @prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
54
- reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
53
+ prot_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
55
54
  # deliberately using a counterintuitive sorting method to give little guys
56
55
  # a chance
57
56
  -prot_and_peptide_hits.last.size
@@ -61,7 +60,7 @@ describe 'creating minimal protein groups from peptide hits' do
61
60
  # to add to the mix. This demonstrates how proteins can be weighted in
62
61
  # different ways based on their peptide hits.
63
62
  seen = []
64
- reply.each {|pair| pair.first.each {|prot| seen << prot.id } }
63
+ prot_groups.each {|pg| pg.each {|prot| seen << prot.id } }
65
64
  # big guy is completely accounted for in the now prioritized little guy
66
65
  # and medium guys, etc.
67
66
  seen.sort.is @prot_hits_hash.keys[1..-1].sort
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 18
9
- version: 0.0.18
8
+ - 19
9
+ version: 0.0.19
10
10
  platform: ruby
11
11
  authors:
12
12
  - John T. Prince
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-28 00:00:00 -06:00
17
+ date: 2011-03-30 00:00:00 -06:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -97,7 +97,7 @@ dependencies:
97
97
  version: "0"
98
98
  type: :development
99
99
  version_requirements: *id006
100
- description: mspire library for working with mzIdentML and pepxml
100
+ description: mspire library for working with mzIdentML, pepxml, and related.
101
101
  email: jtprince@gmail.com
102
102
  executables: []
103
103
 
@@ -116,6 +116,8 @@ files:
116
116
  - lib/ms/ident.rb
117
117
  - lib/ms/ident/peptide.rb
118
118
  - lib/ms/ident/peptide/db.rb
119
+ - lib/ms/ident/peptide_hit.rb
120
+ - lib/ms/ident/peptide_hit/qvalue.rb
119
121
  - lib/ms/ident/pepxml.rb
120
122
  - lib/ms/ident/pepxml/modifications.rb
121
123
  - lib/ms/ident/pepxml/msms_pipeline_analysis.rb
@@ -131,6 +133,8 @@ files:
131
133
  - lib/ms/ident/pepxml/search_summary.rb
132
134
  - lib/ms/ident/pepxml/spectrum_query.rb
133
135
  - lib/ms/ident/protein.rb
136
+ - lib/ms/ident/protein_group.rb
137
+ - lib/ms/ident/protein_hit.rb
134
138
  - lib/ms/ident/search.rb
135
139
  - schema/pepXML_v115.xsd
136
140
  - schema/pepXML_v19.xsd
@@ -138,7 +142,7 @@ files:
138
142
  - spec/ms/ident/pepxml/sample_enzyme_spec.rb
139
143
  - spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
140
144
  - spec/ms/ident/pepxml_spec.rb
141
- - spec/ms/ident/protein_spec.rb
145
+ - spec/ms/ident/protein_group_spec.rb
142
146
  - spec/spec_helper.rb
143
147
  - spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta
144
148
  - spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml
@@ -179,5 +183,5 @@ test_files:
179
183
  - spec/ms/ident/pepxml/sample_enzyme_spec.rb
180
184
  - spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
181
185
  - spec/ms/ident/pepxml_spec.rb
182
- - spec/ms/ident/protein_spec.rb
186
+ - spec/ms/ident/protein_group_spec.rb
183
187
  - spec/spec_helper.rb