ms-ident 0.0.18 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -8,7 +8,7 @@ Jeweler::Tasks.new do |gem|
8
8
  gem.homepage = "http://github.com/jtprince/ms-ident"
9
9
  gem.license = "MIT"
10
10
  gem.summary = %Q{mspire library for working with mzIdentML and pepxml}
11
- gem.description = %Q{mspire library for working with mzIdentML and pepxml}
11
+ gem.description = %Q{mspire library for working with mzIdentML, pepxml, and related.}
12
12
  gem.email = "jtprince@gmail.com"
13
13
  gem.authors = ["John T. Prince"]
14
14
  gem.rubyforge_project = 'mspire'
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.0.18
1
+ 0.0.19
@@ -5,7 +5,11 @@ module Ms ; end
5
5
  module Ms::Ident ; end
6
6
  module Ms::Ident::Peptide ; end
7
7
 
8
- module Ms::Ident::Peptide::Db
8
+ # the object itself is a modified Hash.
9
+ # It is initialized with the database file and a protein array can be
10
+ # retrieved with the #[] method given an amino acid sequence. All other
11
+ # methods are untested at this time and should be avoided!
12
+ class Ms::Ident::Peptide::Db < Hash
9
13
  MAX_NUM_AA_EXPANSION = 3
10
14
 
11
15
  # the twenty standard amino acids
@@ -165,12 +169,24 @@ module Ms::Ident::Peptide::Db
165
169
  to_expand
166
170
  end
167
171
 
172
+ def initialize(db_file)
173
+ self.replace(YAML.load_file(db_file))
174
+ end
175
+
176
+ alias_method :old_bracket, '[]'.to_sym
177
+
178
+ # returns the protein ids as an array
179
+ def [](key)
180
+ old_bracket(key).chomp.split(PROTEIN_DELIMITER)
181
+ end
182
+
168
183
  # an object for on disk retrieval of db entries
169
184
  # proteins are returned as an array.
170
185
  # behaves much like a hash once it is opened.
171
186
  class IO
172
187
  include Enumerable
173
188
  def self.open(filename, &block)
189
+ raise ArgumentError unless block
174
190
  File.open(filename) do |io|
175
191
  block.call(self.new(io))
176
192
  end
@@ -192,9 +208,11 @@ module Ms::Ident::Peptide::Db
192
208
  @index[key] = [start, end_pos-start]
193
209
  end
194
210
  end
211
+
195
212
  # returns an array of proteins for the given key (peptide aaseq)
196
213
  def [](key)
197
214
  (start, length) = @index[key]
215
+ return nil unless start
198
216
  @io.seek(start)
199
217
  string = @io.read(length)
200
218
  string.chomp!
@@ -0,0 +1,55 @@
1
+ require 'ms/ident/peptide_hit'
2
+
3
+ module Ms ; end
4
+ module Ms::Ident ; end
5
+
6
+ class Ms::Ident::PeptideHit
7
+ module Qvalue
8
+ attr_accessor :qvalue
9
+ FILE_EXTENSION = '.phq.tsv'
10
+ FILE_DELIMITER = "\t"
11
+ HEADER = %w(aaseq charge qvalue)
12
+
13
+ class << self
14
+
15
+ # writes to the file, adding an extension
16
+ def to_phq(base, hits, qvalues=nil)
17
+ to_file(base + FILE_EXTENSION, hits)
18
+ end
19
+
20
+ # writes the peptide hits to a phq.tsv file. qvalues is a parallel array
21
+ # to hits that can provide qvalues if not inherent to the hits
22
+ # returns the filename.
23
+ def to_file(filename, hits, qvalues=[])
24
+ File.open(filename,'w') do |out|
25
+ out.puts HEADER.join(FILE_DELIMITER)
26
+ hits.zip(qvalues) do |hit, qvalue|
27
+ out.puts [hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
28
+ end
29
+ end
30
+ filename
31
+ end
32
+
33
+ # returns an array of PeptideHit objects from a phq.tsv
34
+ def from_file(filename)
35
+ peptide_hits = []
36
+ File.open(filename) do |io|
37
+ header = io.readline.chomp.split(FILE_DELIMITER)
38
+ raise "bad headers" unless header == HEADER
39
+ io.each do |line|
40
+ line.chomp!
41
+ (aaseq, charge, qvalue) = line.split(FILE_DELIMITER)
42
+ ph = Ms::Ident::PeptideHit.new
43
+ ph.aaseq = aaseq ; ph.charge = charge.to_i ; ph.qvalue = qvalue.to_f
44
+ peptide_hits << ph
45
+ end
46
+ end
47
+ peptide_hits
48
+ end
49
+
50
+ alias_method :from_phq, :from_file
51
+
52
+ end
53
+ end # Qvalue
54
+ include Qvalue
55
+ end # Peptide Hit
@@ -0,0 +1,8 @@
1
+ module Ms ; end
2
+ module Ms::Ident ; end
3
+
4
+ class Ms::Ident::PeptideHit
5
+ attr_accessor :aaseq
6
+ attr_accessor :charge
7
+ attr_accessor :proteins
8
+ end
@@ -1,70 +1,12 @@
1
1
  module Ms ; end
2
2
  module Ms::Ident ; end
3
3
 
4
- require 'set'
5
4
 
6
5
  module Ms::Ident::Protein
7
-
8
- class << self
9
- end
10
-
11
6
  # gives the information up until the first space or carriage return.
12
7
  # Assumes the protein can respond_to? :reference
13
8
  def first_entry
14
9
  reference.split(/[\s\r]/)[0]
15
10
  end
16
-
17
- PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
18
- peptide_hits = protein_group_and_peptide_hits.last
19
- num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
20
- num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
21
- [num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
22
- end
23
-
24
-
25
- module_function
26
- # greedy algorithm to map a set of peptide_hits to protein groups. each
27
- # peptide hit should respond to :aaseq, :charge, :proteins if a block is
28
- # given, yields a single argument: a doublet of protein_group and peptide
29
- # set. It expects a metric or array to sort by for creating greedy protein
30
- # groups (the greediest proteins should sort to the back of the array). if
31
- # no block is given, the groups are sorted by [# uniq aaseqs, # uniq
32
- # aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS). Sets of
33
- # peptide_hits and the objects returned by peptide_hit#proteins are used as
34
- # hash keys. As long as each peptide hit has a unique signature (like an
35
- # id) then any object will work. If they are Struct objects, you might
36
- # consider redefining the #hash method to be object_id for performance and
37
- # accuracy.
38
- def peptide_hits_to_protein_groups(peptide_hits, &sort_by)
39
- sort_by ||= PRIORITIZE_PROTEINS
40
- # note to self: I wrote this in 2011, so I think I know what I'm doing now
41
- protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
42
- peptide_hits.each do |peptide_hit|
43
- peptide_hit.proteins.each do |protein|
44
- protein_to_peptides[protein] << peptide_hit
45
- end
46
- end
47
- peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
48
- protein_to_peptides.each do |protein, peptide_set|
49
- peptides_to_protein_group[peptide_set] << protein
50
- end
51
- protein_group_to_peptides = peptides_to_protein_group.invert
52
- greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
53
- accounted_for = Set.new
54
- surviving_protein_groups = []
55
- # we are discarding the subsumed sets, but we could get them with
56
- # partition
57
- greedy_first.select do |group, peptide_set|
58
- has_an_unaccounted_peptide = false
59
- peptide_set.each do |peptide_hit|
60
- unless accounted_for.include?(peptide_hit)
61
- has_an_unaccounted_peptide = true
62
- accounted_for.add(peptide_hit)
63
- end
64
- end
65
- has_an_unaccounted_peptide
66
- end
67
- end
68
-
69
11
  end
70
12
 
@@ -0,0 +1,72 @@
1
+ require 'set'
2
+
3
+ module Ms
4
+ module Ident
5
+ # represents a group of proteins, typically indistinguishable in the
6
+ # experiment.
7
+ class ProteinGroup < Array
8
+ attr_accessor :peptide_hits
9
+
10
+ PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
11
+ peptide_hits = protein_group_and_peptide_hits.last
12
+ num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
13
+ num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
14
+ [num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
15
+ end
16
+
17
+ # greedy algorithm to map a set of peptide_hits to protein groups. each
18
+ # peptide hit should respond to :aaseq, :charge, :proteins. If a block is
19
+ # given, yields a single argument: a doublet of protein_group and peptide
20
+ # set. It expects a metric or array to sort by for creating greedy protein
21
+ # groups (the greediest proteins should sort to the back of the array). if
22
+ # no block is given, the groups are sorted by [# uniq aaseqs, # uniq
23
+ # aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS). Sets of
24
+ # peptide_hits and the objects returned by peptide_hit#proteins are used as
25
+ # hash keys. As long as each peptide hit has a unique signature (like an
26
+ # id) then any object will work. If they are Struct objects, you might
27
+ # consider redefining the #hash method to be object_id for performance and
28
+ # accuracy.
29
+ #
30
+ # returns an array of ProteinGroup objects, each set with :peptide_hits
31
+ def self.peptide_hits_to_protein_groups(peptide_hits, &sort_by)
32
+ sort_by ||= PRIORITIZE_PROTEINS
33
+ # note to self: I wrote this in 2011, so I think I know what I'm doing now
34
+ protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
35
+ peptide_hits.each do |peptide_hit|
36
+ peptide_hit.proteins.each do |protein|
37
+ protein_to_peptides[protein] << peptide_hit
38
+ end
39
+ end
40
+ peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
41
+ protein_to_peptides.each do |protein, peptide_set|
42
+ peptides_to_protein_group[peptide_set] << protein
43
+ end
44
+ peptides_to_protein_group.each do |pephits,ar_of_prots|
45
+ pg = Ms::Ident::ProteinGroup.new(ar_of_prots)
46
+ pg.peptide_hits = pephits
47
+ peptides_to_protein_group[pephits] = pg
48
+ end
49
+
50
+ protein_group_to_peptides = peptides_to_protein_group.invert
51
+ greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
52
+
53
+ accounted_for = Set.new
54
+ # we are discarding the subsumed sets, but we could get them with
55
+ # partition
56
+ greedy_first.select! do |group, peptide_set|
57
+ has_an_unaccounted_peptide = false
58
+ peptide_set.each do |peptide_hit|
59
+ unless accounted_for.include?(peptide_hit)
60
+ has_an_unaccounted_peptide = true
61
+ accounted_for.add(peptide_hit)
62
+ end
63
+ end
64
+ group.peptide_hits = peptide_set if has_an_unaccounted_peptide
65
+ has_an_unaccounted_peptide
66
+ end
67
+ greedy_first.map(&:first)
68
+ end
69
+
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,17 @@
1
+
2
+ module Ms ; end
3
+ module Ms::Ident ; end
4
+
5
+ class Ms::Ident::ProteinHit
6
+ attr_accessor :id
7
+ attr_accessor :seq
8
+ alias_method :sequence, :seq
9
+ alias_method :sequence=, :seq=
10
+ attr_accessor :peptide_hits
11
+
12
+ def initialize(id=nil)
13
+ @peptide_hits = []
14
+ @id = id
15
+ end
16
+ end
17
+
@@ -82,6 +82,12 @@ describe 'reading a peptide centric database' do
82
82
  outfiles = Ms::Ident::Peptide::Db.cmdline([FASTA_FILE])
83
83
  @outfile = outfiles.first
84
84
 
85
+ it 'creates a hash that can retrieve peptides as an array' do
86
+ hash = Ms::Ident::Peptide::Db.new(@outfile)
87
+ hash["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
88
+ hash["VRAAR"].enums ["tr|D3DX18|D3DX18_HUMAN"]
89
+ end
90
+
85
91
  it 'reads the file on disk with random access or is enumerable' do
86
92
  Ms::Ident::Peptide::Db::IO.open(@outfile) do |io|
87
93
  io["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
@@ -1,6 +1,6 @@
1
1
  require 'spec_helper'
2
2
 
3
- require 'ms/ident/protein'
3
+ require 'ms/ident/protein_group'
4
4
 
5
5
  PeptideHit = Struct.new(:aaseq, :charge, :proteins) do
6
6
  def inspect # easier to read output
@@ -36,22 +36,21 @@ describe 'creating minimal protein groups from peptide hits' do
36
36
  it 'is a greedy algorithm' do
37
37
  @prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
38
38
  # big_guy has all the peptides, so it takes them all
39
- reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
40
- reply.first.size.is 2 # the group and the peptide set
41
- reply.first.first.size.is 1 # the group
42
- reply.first.first.first.id.is 'big_guy'
39
+ protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
40
+ protein_groups.first.size.is 1# the group
41
+ protein_groups.first.first.id.is 'big_guy'
43
42
  end
44
43
 
45
44
  it 'removes proteins accounted for only as little pieces of larger proteins' do
46
45
  @prot_hits[1..-1].each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
47
- reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
46
+ protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
48
47
  # no subsumed_by_medium
49
- reply.map(&:first).any? {|protein_list| protein_list.any? {|v| v.id == 'subsumed_by_medium' }}.is false
48
+ protein_groups.any? {|prot_group| prot_group.any? {|v| v.id == 'subsumed_by_medium' }}.is false
50
49
  end
51
50
 
52
51
  it 'allows alternate sorting algorithms for greediness' do
53
52
  @prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
54
- reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
53
+ prot_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
55
54
  # deliberately using a counterintuitive sorting method to give little guys
56
55
  # a chance
57
56
  -prot_and_peptide_hits.last.size
@@ -61,7 +60,7 @@ describe 'creating minimal protein groups from peptide hits' do
61
60
  # to add to the mix. This demonstrates how proteins can be weighted in
62
61
  # different ways based on their peptide hits.
63
62
  seen = []
64
- reply.each {|pair| pair.first.each {|prot| seen << prot.id } }
63
+ prot_groups.each {|pg| pg.each {|prot| seen << prot.id } }
65
64
  # big guy is completely accounted for in the now prioritized little guy
66
65
  # and medium guys, etc.
67
66
  seen.sort.is @prot_hits_hash.keys[1..-1].sort
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 0
8
- - 18
9
- version: 0.0.18
8
+ - 19
9
+ version: 0.0.19
10
10
  platform: ruby
11
11
  authors:
12
12
  - John T. Prince
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-03-28 00:00:00 -06:00
17
+ date: 2011-03-30 00:00:00 -06:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -97,7 +97,7 @@ dependencies:
97
97
  version: "0"
98
98
  type: :development
99
99
  version_requirements: *id006
100
- description: mspire library for working with mzIdentML and pepxml
100
+ description: mspire library for working with mzIdentML, pepxml, and related.
101
101
  email: jtprince@gmail.com
102
102
  executables: []
103
103
 
@@ -116,6 +116,8 @@ files:
116
116
  - lib/ms/ident.rb
117
117
  - lib/ms/ident/peptide.rb
118
118
  - lib/ms/ident/peptide/db.rb
119
+ - lib/ms/ident/peptide_hit.rb
120
+ - lib/ms/ident/peptide_hit/qvalue.rb
119
121
  - lib/ms/ident/pepxml.rb
120
122
  - lib/ms/ident/pepxml/modifications.rb
121
123
  - lib/ms/ident/pepxml/msms_pipeline_analysis.rb
@@ -131,6 +133,8 @@ files:
131
133
  - lib/ms/ident/pepxml/search_summary.rb
132
134
  - lib/ms/ident/pepxml/spectrum_query.rb
133
135
  - lib/ms/ident/protein.rb
136
+ - lib/ms/ident/protein_group.rb
137
+ - lib/ms/ident/protein_hit.rb
134
138
  - lib/ms/ident/search.rb
135
139
  - schema/pepXML_v115.xsd
136
140
  - schema/pepXML_v19.xsd
@@ -138,7 +142,7 @@ files:
138
142
  - spec/ms/ident/pepxml/sample_enzyme_spec.rb
139
143
  - spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
140
144
  - spec/ms/ident/pepxml_spec.rb
141
- - spec/ms/ident/protein_spec.rb
145
+ - spec/ms/ident/protein_group_spec.rb
142
146
  - spec/spec_helper.rb
143
147
  - spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta
144
148
  - spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml
@@ -179,5 +183,5 @@ test_files:
179
183
  - spec/ms/ident/pepxml/sample_enzyme_spec.rb
180
184
  - spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
181
185
  - spec/ms/ident/pepxml_spec.rb
182
- - spec/ms/ident/protein_spec.rb
186
+ - spec/ms/ident/protein_group_spec.rb
183
187
  - spec/spec_helper.rb