ms-ident 0.0.18 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/ms/ident/peptide/db.rb +19 -1
- data/lib/ms/ident/peptide_hit/qvalue.rb +55 -0
- data/lib/ms/ident/peptide_hit.rb +8 -0
- data/lib/ms/ident/protein.rb +0 -58
- data/lib/ms/ident/protein_group.rb +72 -0
- data/lib/ms/ident/protein_hit.rb +17 -0
- data/spec/ms/ident/peptide/db_spec.rb +6 -0
- data/spec/ms/ident/{protein_spec.rb → protein_group_spec.rb} +8 -9
- metadata +10 -6
data/Rakefile
CHANGED
@@ -8,7 +8,7 @@ Jeweler::Tasks.new do |gem|
|
|
8
8
|
gem.homepage = "http://github.com/jtprince/ms-ident"
|
9
9
|
gem.license = "MIT"
|
10
10
|
gem.summary = %Q{mspire library for working with mzIdentML and pepxml}
|
11
|
-
gem.description = %Q{mspire library for working with mzIdentML and pepxml}
|
11
|
+
gem.description = %Q{mspire library for working with mzIdentML, pepxml, and related.}
|
12
12
|
gem.email = "jtprince@gmail.com"
|
13
13
|
gem.authors = ["John T. Prince"]
|
14
14
|
gem.rubyforge_project = 'mspire'
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.18
|
1
|
+
0.0.19
|
data/lib/ms/ident/peptide/db.rb
CHANGED
@@ -5,7 +5,11 @@ module Ms ; end
|
|
5
5
|
module Ms::Ident ; end
|
6
6
|
module Ms::Ident::Peptide ; end
|
7
7
|
|
8
|
-
|
8
|
+
# the object itself is a modified Hash.
|
9
|
+
# It is initialized with the database file and a protein array can be
|
10
|
+
# retrieved with the #[] method given an amino acid sequence. All other
|
11
|
+
# methods are untested at this time and should be avoided!
|
12
|
+
class Ms::Ident::Peptide::Db < Hash
|
9
13
|
MAX_NUM_AA_EXPANSION = 3
|
10
14
|
|
11
15
|
# the twenty standard amino acids
|
@@ -165,12 +169,24 @@ module Ms::Ident::Peptide::Db
|
|
165
169
|
to_expand
|
166
170
|
end
|
167
171
|
|
172
|
+
def initialize(db_file)
|
173
|
+
self.replace(YAML.load_file(db_file))
|
174
|
+
end
|
175
|
+
|
176
|
+
alias_method :old_bracket, '[]'.to_sym
|
177
|
+
|
178
|
+
# returns the protein id's as an array
|
179
|
+
def [](key)
|
180
|
+
old_bracket(key).chomp.split(PROTEIN_DELIMITER)
|
181
|
+
end
|
182
|
+
|
168
183
|
# an object for on disk retrieval of db entries
|
169
184
|
# proteins are returned as an array.
|
170
185
|
# behaves much like a hash once it is opened.
|
171
186
|
class IO
|
172
187
|
include Enumerable
|
173
188
|
def self.open(filename, &block)
|
189
|
+
raise ArgumentError unless block
|
174
190
|
File.open(filename) do |io|
|
175
191
|
block.call(self.new(io))
|
176
192
|
end
|
@@ -192,9 +208,11 @@ module Ms::Ident::Peptide::Db
|
|
192
208
|
@index[key] = [start, end_pos-start]
|
193
209
|
end
|
194
210
|
end
|
211
|
+
|
195
212
|
# returns an array of proteins for the given key (peptide aaseq)
|
196
213
|
def [](key)
|
197
214
|
(start, length) = @index[key]
|
215
|
+
return nil unless start
|
198
216
|
@io.seek(start)
|
199
217
|
string = @io.read(length)
|
200
218
|
string.chomp!
|
@@ -0,0 +1,55 @@
|
|
1
|
+
require 'ms/ident/peptide_hit'
|
2
|
+
|
3
|
+
module Ms ; end
|
4
|
+
module Ms::Ident ; end
|
5
|
+
|
6
|
+
class Ms::Ident::PeptideHit
|
7
|
+
module Qvalue
|
8
|
+
attr_accessor :qvalue
|
9
|
+
FILE_EXTENSION = '.phq.tsv'
|
10
|
+
FILE_DELIMITER = "\t"
|
11
|
+
HEADER = %w(aaseq charge qvalue)
|
12
|
+
|
13
|
+
class << self
|
14
|
+
|
15
|
+
# writes to the file, adding an extension
|
16
|
+
def to_phq(base, hits, qvalues=nil)
|
17
|
+
to_file(base + FILE_EXTENSION, hits)
|
18
|
+
end
|
19
|
+
|
20
|
+
# writes the peptide hits to a phq.tsv file. qvalues is a parallel array
|
21
|
+
# to hits that can provide qvalues if not inherent to the hits
|
22
|
+
# returns the filename.
|
23
|
+
def to_file(filename, hits, qvalues=[])
|
24
|
+
File.open(filename,'w') do |out|
|
25
|
+
out.puts HEADER.join(FILE_DELIMITER)
|
26
|
+
hits.zip(qvalues) do |hit, qvalue|
|
27
|
+
out.puts [hit.aaseq, hit.charge, qvalue || hit.qvalue].join(FILE_DELIMITER)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
filename
|
31
|
+
end
|
32
|
+
|
33
|
+
# returns an array of PeptideHit objects from a phq.tsv
|
34
|
+
def from_file(filename)
|
35
|
+
peptide_hits = []
|
36
|
+
File.open(filename) do |io|
|
37
|
+
header = io.readline.chomp.split(FILE_DELIMITER)
|
38
|
+
raise "bad headers" unless header == HEADER
|
39
|
+
io.each do |line|
|
40
|
+
line.chomp!
|
41
|
+
(aaseq, charge, qvalue) = line.split(FILE_DELIMITER)
|
42
|
+
ph = Ms::Ident::PeptideHit.new
|
43
|
+
ph.aaseq = aaseq ; ph.charge = charge.to_i ; ph.qvalue = qvalue.to_f
|
44
|
+
peptide_hits << ph
|
45
|
+
end
|
46
|
+
end
|
47
|
+
peptide_hits
|
48
|
+
end
|
49
|
+
|
50
|
+
alias_method :from_phq, :from_file
|
51
|
+
|
52
|
+
end
|
53
|
+
end # Qvalue
|
54
|
+
include Qvalue
|
55
|
+
end # Peptide Hit
|
data/lib/ms/ident/protein.rb
CHANGED
@@ -1,70 +1,12 @@
|
|
1
1
|
module Ms ; end
|
2
2
|
module Ms::Ident ; end
|
3
3
|
|
4
|
-
require 'set'
|
5
4
|
|
6
5
|
module Ms::Ident::Protein
|
7
|
-
|
8
|
-
class << self
|
9
|
-
end
|
10
|
-
|
11
6
|
# gives the information up until the first space or carriage return.
|
12
7
|
# Assumes the protein can respond_to? :reference
|
13
8
|
def first_entry
|
14
9
|
reference.split(/[\s\r]/)[0]
|
15
10
|
end
|
16
|
-
|
17
|
-
PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
|
18
|
-
peptide_hits = protein_group_and_peptide_hits.last
|
19
|
-
num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
|
20
|
-
num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
|
21
|
-
[num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
|
22
|
-
end
|
23
|
-
|
24
|
-
|
25
|
-
module_function
|
26
|
-
# greedy algorithm to map a set of peptide_hits to protein groups. each
|
27
|
-
# peptide hit should respond to :aaseq, :charge, :proteins if a block is
|
28
|
-
# given, yields a single argument: a doublet of protein_group and peptide
|
29
|
-
# set. It expects a metric or array to sort by for creating greedy protein
|
30
|
-
# groups (the greediest proteins should sort to the back of the array). if
|
31
|
-
# no block is given, the groups are sorted by [# uniq aaseqs, # uniq
|
32
|
-
# aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS). Sets of
|
33
|
-
# peptide_hits and the objects returned by peptide_hit#proteins are used as
|
34
|
-
# hash keys. As long as each peptide hit has a unique signature (like an
|
35
|
-
# id) then any object will work. If they are Struct objects, you might
|
36
|
-
# consider redefining the #hash method to be object_id for performance and
|
37
|
-
# accuracy.
|
38
|
-
def peptide_hits_to_protein_groups(peptide_hits, &sort_by)
|
39
|
-
sort_by ||= PRIORITIZE_PROTEINS
|
40
|
-
# note to self: I wrote this in 2011, so I think I know what I'm doing now
|
41
|
-
protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
|
42
|
-
peptide_hits.each do |peptide_hit|
|
43
|
-
peptide_hit.proteins.each do |protein|
|
44
|
-
protein_to_peptides[protein] << peptide_hit
|
45
|
-
end
|
46
|
-
end
|
47
|
-
peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
|
48
|
-
protein_to_peptides.each do |protein, peptide_set|
|
49
|
-
peptides_to_protein_group[peptide_set] << protein
|
50
|
-
end
|
51
|
-
protein_group_to_peptides = peptides_to_protein_group.invert
|
52
|
-
greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
|
53
|
-
accounted_for = Set.new
|
54
|
-
surviving_protein_groups = []
|
55
|
-
# we are discarding the subsumed sets, but we could get them with
|
56
|
-
# partition
|
57
|
-
greedy_first.select do |group, peptide_set|
|
58
|
-
has_an_unaccounted_peptide = false
|
59
|
-
peptide_set.each do |peptide_hit|
|
60
|
-
unless accounted_for.include?(peptide_hit)
|
61
|
-
has_an_unaccounted_peptide = true
|
62
|
-
accounted_for.add(peptide_hit)
|
63
|
-
end
|
64
|
-
end
|
65
|
-
has_an_unaccounted_peptide
|
66
|
-
end
|
67
|
-
end
|
68
|
-
|
69
11
|
end
|
70
12
|
|
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module Ms
|
4
|
+
module Ident
|
5
|
+
# represents a group of proteins, typically indistinguishable in the
|
6
|
+
# experiment.
|
7
|
+
class ProteinGroup < Array
|
8
|
+
attr_accessor :peptide_hits
|
9
|
+
|
10
|
+
PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
|
11
|
+
peptide_hits = protein_group_and_peptide_hits.last
|
12
|
+
num_uniq_aaseqs = peptide_hits.map {|hit| hit.aaseq }.uniq.size
|
13
|
+
num_uniq_aaseqs_at_z = peptide_hits.map {|hit| [hit.aaseq, hit.charge] }.uniq.size
|
14
|
+
[num_uniq_aaseqs, num_uniq_aaseqs_at_z, peptide_hits.size]
|
15
|
+
end
|
16
|
+
|
17
|
+
# greedy algorithm to map a set of peptide_hits to protein groups. each
|
18
|
+
# peptide hit should respond to :aaseq, :charge, :proteins if a block is
|
19
|
+
# given, yields a single argument: a doublet of protein_group and peptide
|
20
|
+
# set. It expects a metric or array to sort by for creating greedy protein
|
21
|
+
# groups (the greediest proteins should sort to the back of the array). if
|
22
|
+
# no block is given, the groups are sorted by [# uniq aaseqs, # uniq
|
23
|
+
# aaseq+charge, # peptide_hits] (see PRIORITIZE_PROTEINS). Sets of
|
24
|
+
# peptide_hits and the objects returned by peptide_hit#proteins are used as
|
25
|
+
# hash keys. As long as each peptide hit has a unique signature (like an
|
26
|
+
# id) then any object will work. If they are Struct objects, you might
|
27
|
+
# consider redefining the #hash method to be object_id for performance and
|
28
|
+
# accuracy.
|
29
|
+
#
|
30
|
+
# returns an array of ProteinGroup objects, each set with :peptide_hits
|
31
|
+
def self.peptide_hits_to_protein_groups(peptide_hits, &sort_by)
|
32
|
+
sort_by ||= PRIORITIZE_PROTEINS
|
33
|
+
# note to self: I wrote this in 2011, so I think I know what I'm doing now
|
34
|
+
protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
|
35
|
+
peptide_hits.each do |peptide_hit|
|
36
|
+
peptide_hit.proteins.each do |protein|
|
37
|
+
protein_to_peptides[protein] << peptide_hit
|
38
|
+
end
|
39
|
+
end
|
40
|
+
peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
|
41
|
+
protein_to_peptides.each do |protein, peptide_set|
|
42
|
+
peptides_to_protein_group[peptide_set] << protein
|
43
|
+
end
|
44
|
+
peptides_to_protein_group.each do |pephits,ar_of_prots|
|
45
|
+
pg = Ms::Ident::ProteinGroup.new(ar_of_prots)
|
46
|
+
pg.peptide_hits = pephits
|
47
|
+
peptides_to_protein_group[pephits] = pg
|
48
|
+
end
|
49
|
+
|
50
|
+
protein_group_to_peptides = peptides_to_protein_group.invert
|
51
|
+
greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
|
52
|
+
|
53
|
+
accounted_for = Set.new
|
54
|
+
# we are discarding the subsumed sets, but we could get them with
|
55
|
+
# partition
|
56
|
+
greedy_first.select! do |group, peptide_set|
|
57
|
+
has_an_unaccounted_peptide = false
|
58
|
+
peptide_set.each do |peptide_hit|
|
59
|
+
unless accounted_for.include?(peptide_hit)
|
60
|
+
has_an_unaccounted_peptide = true
|
61
|
+
accounted_for.add(peptide_hit)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
group.peptide_hits = peptide_set if has_an_unaccounted_peptide
|
65
|
+
has_an_unaccounted_peptide
|
66
|
+
end
|
67
|
+
greedy_first.map(&:first)
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
|
2
|
+
module Ms ; end
|
3
|
+
module Ms::Ident ; end
|
4
|
+
|
5
|
+
class Ms::Ident::ProteinHit
|
6
|
+
attr_accessor :id
|
7
|
+
attr_accessor :seq
|
8
|
+
alias_method :sequence, :seq
|
9
|
+
alias_method :sequence=, :seq=
|
10
|
+
attr_accessor :peptide_hits
|
11
|
+
|
12
|
+
def initialize(id=nil)
|
13
|
+
@peptide_hits = []
|
14
|
+
@id = id
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
@@ -82,6 +82,12 @@ describe 'reading a peptide centric database' do
|
|
82
82
|
outfiles = Ms::Ident::Peptide::Db.cmdline([FASTA_FILE])
|
83
83
|
@outfile = outfiles.first
|
84
84
|
|
85
|
+
it 'creates a hash that can retrieve peptides as an array' do
|
86
|
+
hash = Ms::Ident::Peptide::Db.new(@outfile)
|
87
|
+
hash["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
|
88
|
+
hash["VRAAR"].enums ["tr|D3DX18|D3DX18_HUMAN"]
|
89
|
+
end
|
90
|
+
|
85
91
|
it 'reads the file on disk with random access or is enumerable' do
|
86
92
|
Ms::Ident::Peptide::Db::IO.open(@outfile) do |io|
|
87
93
|
io["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
|
-
require 'ms/ident/protein'
|
3
|
+
require 'ms/ident/protein_group'
|
4
4
|
|
5
5
|
PeptideHit = Struct.new(:aaseq, :charge, :proteins) do
|
6
6
|
def inspect # easier to read output
|
@@ -36,22 +36,21 @@ describe 'creating minimal protein groups from peptide hits' do
|
|
36
36
|
it 'is a greedy algorithm' do
|
37
37
|
@prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
|
38
38
|
# big_guy has all the peptides, so it takes them all
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
reply.first.first.first.id.is 'big_guy'
|
39
|
+
protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
|
40
|
+
protein_groups.first.size.is 1# the group
|
41
|
+
protein_groups.first.first.id.is 'big_guy'
|
43
42
|
end
|
44
43
|
|
45
44
|
it 'removes proteins accounted for only as little pieces of larger proteins' do
|
46
45
|
@prot_hits[1..-1].each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
|
47
|
-
|
46
|
+
protein_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits)
|
48
47
|
# no subsumed_by_medium
|
49
|
-
|
48
|
+
protein_groups.any? {|prot_group| prot_group.any? {|v| v.id == 'subsumed_by_medium' }}.is false
|
50
49
|
end
|
51
50
|
|
52
51
|
it 'allows alternate sorting algorithms for greediness' do
|
53
52
|
@prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
|
54
|
-
|
53
|
+
prot_groups = Ms::Ident::ProteinGroup.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
|
55
54
|
# deliberate using a counterintuitive sorting method to give little guys
|
56
55
|
# a chance
|
57
56
|
-prot_and_peptide_hits.last.size
|
@@ -61,7 +60,7 @@ describe 'creating minimal protein groups from peptide hits' do
|
|
61
60
|
# to add to the mix. This demonstrates how proteins can be weighted in
|
62
61
|
# different ways based on their peptide hits.
|
63
62
|
seen = []
|
64
|
-
|
63
|
+
prot_groups.each {|pg| pg.each {|prot| seen << prot.id } }
|
65
64
|
# big guy is completely accounted for in the now prioritized little guy
|
66
65
|
# and medium guys, etc.
|
67
66
|
seen.sort.is @prot_hits_hash.keys[1..-1].sort
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 18
|
9
|
-
version: 0.0.18
|
8
|
+
- 19
|
9
|
+
version: 0.0.19
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- John T. Prince
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-03-
|
17
|
+
date: 2011-03-30 00:00:00 -06:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -97,7 +97,7 @@ dependencies:
|
|
97
97
|
version: "0"
|
98
98
|
type: :development
|
99
99
|
version_requirements: *id006
|
100
|
-
description: mspire library for working with mzIdentML and pepxml
|
100
|
+
description: mspire library for working with mzIdentML, pepxml, and related.
|
101
101
|
email: jtprince@gmail.com
|
102
102
|
executables: []
|
103
103
|
|
@@ -116,6 +116,8 @@ files:
|
|
116
116
|
- lib/ms/ident.rb
|
117
117
|
- lib/ms/ident/peptide.rb
|
118
118
|
- lib/ms/ident/peptide/db.rb
|
119
|
+
- lib/ms/ident/peptide_hit.rb
|
120
|
+
- lib/ms/ident/peptide_hit/qvalue.rb
|
119
121
|
- lib/ms/ident/pepxml.rb
|
120
122
|
- lib/ms/ident/pepxml/modifications.rb
|
121
123
|
- lib/ms/ident/pepxml/msms_pipeline_analysis.rb
|
@@ -131,6 +133,8 @@ files:
|
|
131
133
|
- lib/ms/ident/pepxml/search_summary.rb
|
132
134
|
- lib/ms/ident/pepxml/spectrum_query.rb
|
133
135
|
- lib/ms/ident/protein.rb
|
136
|
+
- lib/ms/ident/protein_group.rb
|
137
|
+
- lib/ms/ident/protein_hit.rb
|
134
138
|
- lib/ms/ident/search.rb
|
135
139
|
- schema/pepXML_v115.xsd
|
136
140
|
- schema/pepXML_v19.xsd
|
@@ -138,7 +142,7 @@ files:
|
|
138
142
|
- spec/ms/ident/pepxml/sample_enzyme_spec.rb
|
139
143
|
- spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
|
140
144
|
- spec/ms/ident/pepxml_spec.rb
|
141
|
-
- spec/ms/ident/protein_spec.rb
|
145
|
+
- spec/ms/ident/protein_group_spec.rb
|
142
146
|
- spec/spec_helper.rb
|
143
147
|
- spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta
|
144
148
|
- spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml
|
@@ -179,5 +183,5 @@ test_files:
|
|
179
183
|
- spec/ms/ident/pepxml/sample_enzyme_spec.rb
|
180
184
|
- spec/ms/ident/pepxml/search_hit/modification_info_spec.rb
|
181
185
|
- spec/ms/ident/pepxml_spec.rb
|
182
|
-
- spec/ms/ident/protein_spec.rb
|
186
|
+
- spec/ms/ident/protein_group_spec.rb
|
183
187
|
- spec/spec_helper.rb
|