ms-ident 0.0.18 → 0.0.19
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/ms/ident/peptide/db.rb +19 -1
- data/lib/ms/ident/peptide_hit/qvalue.rb +55 -0
- data/lib/ms/ident/peptide_hit.rb +8 -0
- data/lib/ms/ident/protein.rb +0 -58
- data/lib/ms/ident/protein_group.rb +72 -0
- data/lib/ms/ident/protein_hit.rb +17 -0
- data/spec/ms/ident/peptide/db_spec.rb +6 -0
- data/spec/ms/ident/{protein_spec.rb → protein_group_spec.rb} +8 -9
- metadata +10 -6
data/Rakefile
CHANGED
@@ -8,7 +8,7 @@ Jeweler::Tasks.new do |gem|
|
|
8
8
|
gem.homepage = "http://github.com/jtprince/ms-ident"
|
9
9
|
gem.license = "MIT"
|
10
10
|
gem.summary = %Q{mspire library for working with mzIdentML and pepxml}
|
11
|
-
gem.description = %Q{mspire library for working with mzIdentML and pepxml}
|
11
|
+
gem.description = %Q{mspire library for working with mzIdentML, pepxml, and related.}
|
12
12
|
gem.email = "jtprince@gmail.com"
|
13
13
|
gem.authors = ["John T. Prince"]
|
14
14
|
gem.rubyforge_project = 'mspire'
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.18
|
1
|
+
0.0.19
|
data/lib/ms/ident/peptide/db.rb
CHANGED
@@ -5,7 +5,11 @@ module Ms ; end
|
|
5
5
|
module Ms::Ident ; end
|
6
6
|
module Ms::Ident::Peptide ; end
|
7
7
|
|
8
|
-
|
8
|
+
# the object itself is a modified Hash.
|
9
|
+
# It is initialized with the database file and a protein array can be
|
10
|
+
# retrieved with the #[] method given an amino acid sequence. All other
|
11
|
+
# methods are untested at this time and should be avoided!
|
12
|
+
class Ms::Ident::Peptide::Db < Hash
|
9
13
|
MAX_NUM_AA_EXPANSION = 3
|
10
14
|
|
11
15
|
# the twenty standard amino acids
|
@@ -165,12 +169,24 @@ module Ms::Ident::Peptide::Db
|
|
165
169
|
to_expand
|
166
170
|
end
|
167
171
|
|
172
|
+
# Builds the Db from a peptide-centric database file: the file is parsed
# with YAML.load_file and the result becomes the contents of this Hash.
# NOTE(review): presumably the YAML document is a mapping of amino acid
# sequence => delimited protein id string (see #[]) -- confirm.
# NOTE(review): YAML.load_file may instantiate arbitrary objects on older
# Ruby/Psych versions; only pass trusted database files.
def initialize(db_file)
|
173
|
+
self.replace(YAML.load_file(db_file))
|
174
|
+
end
|
175
|
+
|
176
|
+
alias_method :old_bracket, '[]'.to_sym
|
177
|
+
|
178
|
+
# Returns the protein ids for the given key (a peptide amino acid sequence)
# as an array, or nil when the key is not in the database. The stored
# value is a single delimited string, so it is chomped and split on
# PROTEIN_DELIMITER. Returning nil for an unknown key (rather than raising
# NoMethodError on nil) matches the behavior of the on-disk IO#[] reader.
def [](key)
  protein_string = old_bracket(key)
  protein_string && protein_string.chomp.split(PROTEIN_DELIMITER)
end
|
182
|
+
|
168
183
|
# an object for on disk retrieval of db entries
|
169
184
|
# proteins are returned as an array.
|
170
185
|
# behaves much like a hash once it is opened.
|
171
186
|
class IO
|
172
187
|
include Enumerable
|
173
188
|
def self.open(filename, &block)
|
189
|
+
raise ArgumentError unless block
|
174
190
|
File.open(filename) do |io|
|
175
191
|
block.call(self.new(io))
|
176
192
|
end
|
@@ -192,9 +208,11 @@ module Ms::Ident::Peptide::Db
|
|
192
208
|
@index[key] = [start, end_pos-start]
|
193
209
|
end
|
194
210
|
end
|
211
|
+
|
195
212
|
# returns an array of proteins for the given key (peptide aaseq)
|
196
213
|
def [](key)
|
197
214
|
(start, length) = @index[key]
|
215
|
+
return nil unless start
|
198
216
|
@io.seek(start)
|
199
217
|
string = @io.read(length)
|
200
218
|
string.chomp!
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'ms/ident/peptide_hit'
|
2
|
+
|
3
|
+
module Ms ; end
module Ms::Ident ; end

class Ms::Ident::PeptideHit
  # Mixin that gives a peptide hit a +qvalue+ attribute.  The module itself
  # also carries helpers (called as Qvalue.to_phq, Qvalue.from_phq, ...) for
  # writing and reading hits as tab-delimited ".phq.tsv" files whose columns
  # are given by HEADER.
  module Qvalue
    attr_accessor :qvalue
    FILE_EXTENSION = '.phq.tsv'
    FILE_DELIMITER = "\t"
    HEADER = %w(aaseq charge qvalue)

    class << self

      # writes to the file, adding an extension (FILE_EXTENSION) to base.
      # qvalues is an optional array parallel to hits; it is forwarded to
      # to_file (it used to be silently ignored).  returns the filename.
      def to_phq(base, hits, qvalues=nil)
        to_file(base + FILE_EXTENSION, hits, qvalues)
      end

      # writes the peptide hits to a phq.tsv file.  qvalues is a parallel array
      # to hits that can provide qvalues if not inherent to the hits (a nil or
      # missing entry falls back to hit.qvalue).
      # returns the filename.
      def to_file(filename, hits, qvalues=[])
        # tolerate an explicit nil (e.g. forwarded from to_phq's default)
        qvalues ||= []
        File.open(filename,'w') do |out|
          out.puts(HEADER.join(FILE_DELIMITER))
          hits.zip(qvalues) do |hit, qvalue|
            out.puts([hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER))
          end
        end
        filename
      end

      # returns an array of PeptideHit objects from a phq.tsv
      # raises RuntimeError ("bad headers") if the first line is not HEADER.
      # charge is read as an Integer and qvalue as a Float.
      def from_file(filename)
        peptide_hits = []
        File.open(filename) do |io|
          header = io.readline.chomp.split(FILE_DELIMITER)
          raise "bad headers" unless header == HEADER
          io.each do |line|
            line.chomp!
            # a blank (e.g. trailing) line is not a hit
            next if line.empty?
            (aaseq, charge, qvalue) = line.split(FILE_DELIMITER)
            ph = Ms::Ident::PeptideHit.new
            ph.aaseq = aaseq ; ph.charge = charge.to_i ; ph.qvalue = qvalue.to_f
            peptide_hits << ph
          end
        end
        peptide_hits
      end

      alias_method :from_phq, :from_file

    end
  end # Qvalue
  include Qvalue
end # Peptide Hit
|
data/lib/ms/ident/protein.rb
CHANGED
@@ -1,70 +1,12 @@
|
|
1
1
|
module Ms ; end
|
2
2
|
module Ms::Ident ; end
|
3
3
|
|
4
|
-
require 'set'
|
5
4
|
|
6
5
|
module Ms::Ident::Protein
|
7
|
-
|
8
|
-
class << self
|
9
|
-
end
|
10
|
-
|
11
6
|
# gives the information up until the first space or carriage return.
|
12
7
|
# Assumes the protein can respond_to? :reference
|
13
8
|
def first_entry
|
14
9
|
reference.split(/[\s\r]/)[0]
|
15
10
|
end
|
16
|
-
|
17
|
-
PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
|
18
|
-
peptide_hits = protein_group_and_peptide_hits.last
|
19
|
-
num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
|
20
|
-
num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
|
21
|
-
[num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
|
22
|
-
end
|
23
|
-
|
24
|
-
|
25
|
-
module_function
|
26
|
-
# greedy algorithm to map a set of peptide_hits to protein groups. each
|
27
|
-
# peptide hit should respond to :aaseq, :charge, :proteins if a block is
|
28
|
-
# given, yields a single argument: a doublet of protein_group and peptide
|
29
|
-
# set. It expects a metric or array to sort by for creating greedy protein
|
30
|
-
# groups (the greediest proteins should sort to the back of the array). if
|
31
|
-
# no block is given, the groups are sorted by [# uniq aaseqs, # uniq
|
32
|
-
# aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS). Sets of
|
33
|
-
# peptide_hits and the objects returned by peptide_hit#proteins are used as
|
34
|
-
# hash keys. As long as each peptide hit has a unique signature (like an
|
35
|
-
# id) then any object will work. If they are Struct objects, you might
|
36
|
-
# consider redefining the #hash method to be object_id for performance and
|
37
|
-
# accuracy.
|
38
|
-
def peptide_hits_to_protein_groups(peptide_hits, &sort_by)
|
39
|
-
sort_by ||= PRIORITIZE_PROTEINS
|
40
|
-
# note to self: I wrote this in 2011, so I think I know what I'm doing now
|
41
|
-
protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
|
42
|
-
peptide_hits.each do |peptide_hit|
|
43
|
-
peptide_hit.proteins.each do |protein|
|
44
|
-
protein_to_peptides[protein] << peptide_hit
|
45
|
-
end
|
46
|
-
end
|
47
|
-
peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
|
48
|
-
protein_to_peptides.each do |protein, peptide_set|
|
49
|
-
peptides_to_protein_group[peptide_set] << protein
|
50
|
-
end
|
51
|
-
protein_group_to_peptides = peptides_to_protein_group.invert
|
52
|
-
greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
|
53
|
-
accounted_for = Set.new
|
54
|
-
surviving_protein_groups = []
|
55
|
-
# we are discarding the subsumed sets, but we could get them with
|
56
|
-
# partition
|
57
|
-
greedy_first.select do |group, peptide_set|
|
58
|
-
has_an_unaccounted_peptide = false
|
59
|
-
peptide_set.each do |peptide_hit|
|
60
|
-
unless accounted_for.include?(peptide_hit)
|
61
|
-
has_an_unaccounted_peptide = true
|
62
|
-
accounted_for.add(peptide_hit)
|
63
|
-
end
|
64
|
-
end
|
65
|
-
has_an_unaccounted_peptide
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
11
|
end
|
70
12
|
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Ms
  module Ident
    # represents a group of proteins, typically indistinguishable in the
    # experiment.  The group itself is an Array of protein objects;
    # #peptide_hits holds the Set of peptide hits shared by every protein in
    # the group.
    class ProteinGroup < Array
      attr_accessor :peptide_hits

      # default greediness metric: given a [protein_group, peptide_hits]
      # doublet, returns [# uniq aaseqs, # uniq aaseq+charge, # peptide_hits]
      PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
        peptide_hits = protein_group_and_peptide_hits.last
        num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
        num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
        [num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
      end

      # greedy algorithm to map a set of peptide_hits to protein groups.  each
      # peptide hit should respond to :aaseq, :charge, :proteins.  if a block is
      # given, it is yielded a single argument: a doublet of protein_group and
      # peptide set.  It expects a metric or array to sort by for creating
      # greedy protein groups (the greediest proteins should sort to the back
      # of the array).  if no block is given, the groups are sorted by
      # [# uniq aaseqs, # uniq aaseq+charge, # peptide_hits] (see
      # PRIORITIZE_PROTEINS).  Sets of peptide_hits and the objects returned by
      # peptide_hit#proteins are used as hash keys.  As long as each peptide
      # hit has a unique signature (like an id) then any object will work.  If
      # they are Struct objects, you might consider redefining the #hash method
      # to be object_id for performance and accuracy.
      #
      # returns an array of ProteinGroup objects, each set with :peptide_hits
      # (greediest group first).  Groups whose peptide hits are all accounted
      # for by greedier groups are discarded.
      def self.peptide_hits_to_protein_groups(peptide_hits, &sort_by)
        sort_by ||= PRIORITIZE_PROTEINS

        # protein => Set of every peptide hit that maps to it
        protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
        peptide_hits.each do |peptide_hit|
          peptide_hit.proteins.each do |protein|
            protein_to_peptides[protein] << peptide_hit
          end
        end

        # proteins with an identical peptide set are indistinguishable and so
        # belong to the same group
        peptides_to_proteins = Hash.new {|h,k| h[k] = [] }
        protein_to_peptides.each do |protein, peptide_set|
          peptides_to_proteins[peptide_set] << protein
        end

        # build the [group, peptide_set] doublets directly; the groups are never
        # used as hash keys, so the protein arrays are not hashed by content
        group_and_peptides = peptides_to_proteins.map do |peptide_set, proteins|
          group = Ms::Ident::ProteinGroup.new(proteins)
          group.peptide_hits = peptide_set
          [group, peptide_set]
        end

        greedy_first = group_and_peptides.sort_by(&sort_by).reverse

        accounted_for = Set.new
        # we are discarding the subsumed sets, but we could get them with
        # partition
        surviving = greedy_first.select do |_group, peptide_set|
          unaccounted = peptide_set.reject {|hit| accounted_for.include?(hit) }
          accounted_for.merge(unaccounted)
          !unaccounted.empty?
        end
        surviving.map(&:first)
      end

    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
module Ms ; end
|
3
|
+
module Ms::Ident ; end
|
4
|
+
|
5
|
+
# A protein hit: an id, an optional sequence (seq, also reachable as
# sequence / sequence=), and a list of peptide hits.
class Ms::Ident::ProteinHit
|
6
|
+
attr_accessor :id
|
7
|
+
attr_accessor :seq
|
8
|
+
# sequence is an alias for seq (reader)
alias_method :sequence, :seq
|
9
|
+
# sequence= is an alias for seq= (writer)
alias_method :sequence=, :seq=
|
10
|
+
attr_accessor :peptide_hits
|
11
|
+
|
12
|
+
# id is optional (defaults to nil); peptide_hits starts as an empty array.
def initialize(id=nil)
|
13
|
+
@peptide_hits = []
|
14
|
+
@id = id
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
@@ -82,6 +82,12 @@ describe 'reading a peptide centric database' do
|
|
82
82
|
outfiles = Ms::Ident::Peptide::Db.cmdline([FASTA_FILE])
|
83
83
|
@outfile = outfiles.first
|
84
84
|
|
85
|
+
it 'creates a hash that can retrieve peptides as an array' do
|
86
|
+
hash = Ms::Ident::Peptide::Db.new(@outfile)
|
87
|
+
hash["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
|
88
|
+
hash["VRAAR"].enums ["tr|D3DX18|D3DX18_HUMAN"]
|
89
|
+
end
|
90
|
+
|
85
91
|
it 'reads the file on disk with random access or is enumerable' do
|
86
92
|
Ms::Ident::Peptide::Db::IO.open(@outfile) do |io|
|
87
93
|
io["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
require 'ms/ident/protein'
|
3
|
+
require 'ms/ident/protein_group'
|
4
4
|
|
5
5
|
PeptideHit = Struct.new(:aaseq, :charge, :proteins) do
|
6
6
|
def inspect # easier to read output
|
@@ -36,22 +36,21 @@ describe 'creating minimal protein groups from peptide hits' do
|
|
36
36
|
it 'is a greedy algorithm' do
|
37
37
|
@prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
|
38
38
|
# big_guy has all the peptides, so it takes them all
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
reply.first.first.first.id.is 'big_guy'
|
39
|
+
protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
|
40
|
+
protein_groups.first.size.is 1# the group
|
41
|
+
protein_groups.first.first.id.is 'big_guy'
|
43
42
|
end
|
44
43
|
|
45
44
|
it 'removes proteins accounted for only as little pieces of larger proteins' do
|
46
45
|
@prot_hits[1..-1].each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
|
47
|
-
|
46
|
+
protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
|
48
47
|
# no subsumed_by_medium
|
49
|
-
|
48
|
+
protein_groups.any? {|prot_group| prot_group.any? {|v| v.id == 'subsumed_by_medium' }}.is false
|
50
49
|
end
|
51
50
|
|
52
51
|
it 'allows alternate sorting algorithms for greediness' do
|
53
52
|
@prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
|
54
|
-
|
53
|
+
prot_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
|
55
54
|
# deliberately using a counterintuitive sorting method to give little guys
|
56
55
|
# a chance
|
57
56
|
-prot_and_peptide_hits.last.size
|
@@ -61,7 +60,7 @@ describe 'creating minimal protein groups from peptide hits' do
|
|
61
60
|
# to add to the mix. This demonstrates how proteins can be weighted in
|
62
61
|
# different ways based on their peptide hits.
|
63
62
|
seen = []
|
64
|
-
|
63
|
+
prot_groups.each {|pg| pg.each {|prot| seen << prot.id } }
|
65
64
|
# big guy is completely accounted for in the now prioritized little guy
|
66
65
|
# and medium guys, etc.
|
67
66
|
seen.sort.is @prot_hits_hash.keys[1..-1].sort
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 18
|
9
|
-
version: 0.0.18
|
8
|
+
- 19
|
9
|
+
version: 0.0.19
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- John T. Prince
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-30 00:00:00 -06:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -97,7 +97,7 @@ dependencies:
|
|
97
97
|
version: "0"
|
98
98
|
type: :development
|
99
99
|
version_requirements: *id006
|
100
|
-
description: mspire library for working with mzIdentML and pepxml
|
100
|
+
description: mspire library for working with mzIdentML, pepxml, and related.
|
101
101
|
email: jtprince@gmail.com
|
102
102
|
executables: []
|
103
103
|
|
@@ -116,6 +116,8 @@ files:
|
|
116
116
|
- lib/ms/ident.rb
|
117
117
|
- lib/ms/ident/peptide.rb
|
118
118
|
- lib/ms/ident/peptide/db.rb
|
119
|
+
- lib/ms/ident/peptide_hit.rb
|
120
|
+
- lib/ms/ident/peptide_hit/qvalue.rb
|
119
121
|
- lib/ms/ident/pepxml.rb
|
120
122
|
- lib/ms/ident/pepxml/modifications.rb
|
121
123
|
- lib/ms/ident/pepxml/msms_pipeline_analysis.rb
|
@@ -131,6 +133,8 @@ files:
|
|
131
133
|
- lib/ms/ident/pepxml/search_summary.rb
|
132
134
|
- lib/ms/ident/pepxml/spectrum_query.rb
|
133
135
|
- lib/ms/ident/protein.rb
|
136
|
+
- lib/ms/ident/protein_group.rb
|
137
|
+
- lib/ms/ident/protein_hit.rb
|
134
138
|
- lib/ms/ident/search.rb
|
135
139
|
- schema/pepXML_v115.xsd
|
136
140
|
- schema/pepXML_v19.xsd
|
@@ -138,7 +142,7 @@ files:
|
|
138
142
|
- spec/ms/ident/pepxml/sample_enzyme_spec.rb
|
139
143
|
- spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
|
140
144
|
- spec/ms/ident/pepxml_spec.rb
|
141
|
-
- spec/ms/ident/protein_spec.rb
|
145
|
+
- spec/ms/ident/protein_group_spec.rb
|
142
146
|
- spec/spec_helper.rb
|
143
147
|
- spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta
|
144
148
|
- spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml
|
@@ -179,5 +183,5 @@ test_files:
|
|
179
183
|
- spec/ms/ident/pepxml/sample_enzyme_spec.rb
|
180
184
|
- spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
|
181
185
|
- spec/ms/ident/pepxml_spec.rb
|
182
|
-
- spec/ms/ident/protein_spec.rb
|
186
|
+
- spec/ms/ident/protein_group_spec.rb
|
183
187
|
- spec/spec_helper.rb
|