ms-ident 0.0.17 → 0.0.18
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +7 -12
- data/VERSION +1 -1
- data/lib/ms/ident/peptide/db.rb +224 -0
- data/lib/ms/ident/peptide.rb +0 -1
- data/lib/ms/ident/protein.rb +54 -1
- data/spec/ms/ident/peptide/db_spec.rb +95 -0
- data/spec/ms/ident/protein_spec.rb +69 -0
- data/spec/spec_helper.rb +1 -12
- data/spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
- data/spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
- metadata +24 -39
- data/Gemfile +0 -31
- data/Gemfile.lock +0 -32
data/Rakefile
CHANGED
@@ -1,12 +1,4 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require 'bundler'
|
3
|
-
begin
|
4
|
-
Bundler.setup(:default, :development)
|
5
|
-
rescue Bundler::BundlerError => e
|
6
|
-
$stderr.puts e.message
|
7
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
-
exit e.status_code
|
9
|
-
end
|
10
2
|
require 'rake'
|
11
3
|
|
12
4
|
require 'jeweler'
|
@@ -20,10 +12,13 @@ Jeweler::Tasks.new do |gem|
|
|
20
12
|
gem.email = "jtprince@gmail.com"
|
21
13
|
gem.authors = ["John T. Prince"]
|
22
14
|
gem.rubyforge_project = 'mspire'
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
15
|
+
gem.add_runtime_dependency 'nokogiri'
|
16
|
+
gem.add_runtime_dependency 'ms-core', ">=0.0.12"
|
17
|
+
gem.add_runtime_dependency 'ms-in_silico'
|
18
|
+
gem.add_runtime_dependency 'andand'
|
19
|
+
gem.add_development_dependency 'spec-more'
|
20
|
+
gem.add_development_dependency 'jeweler'
|
21
|
+
#gem.add_development_dependency 'ms-testdata'
|
27
22
|
end
|
28
23
|
Jeweler::RubygemsDotOrgTasks.new
|
29
24
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.17
|
1
|
+
0.0.18
|
@@ -0,0 +1,224 @@
|
|
1
|
+
require 'ms/in_silico/digester'
|
2
|
+
require 'ms/fasta'
|
3
|
+
|
4
|
+
module Ms ; end
|
5
|
+
module Ms::Ident ; end
|
6
|
+
module Ms::Ident::Peptide ; end
|
7
|
+
|
8
|
+
module Ms::Ident::Peptide::Db
|
9
|
+
MAX_NUM_AA_EXPANSION = 3
|
10
|
+
|
11
|
+
# the twenty standard amino acids
|
12
|
+
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
|
13
|
+
|
14
|
+
DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 2, :min_length => 4, :enzyme => Ms::InSilico::Digester::TRYPSIN, :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
|
15
|
+
|
16
|
+
PROTEIN_DELIMITER = "\t"
|
17
|
+
KEY_VALUE_DELIMITER = ": "
|
18
|
+
|
19
|
+
# Command-line entry point: parses +argv+ and builds a peptide centric db for
# every fasta file left over after option parsing.
#
# argv:: array of command-line strings (recognized options are removed from
#        it in place by OptionParser#parse!)
#
# Returns an array with the full path of each db file written. When no fasta
# file is given the usage message is printed and an empty array is returned.
# Exits the process after printing the enzyme names for --list-enzymes.
def self.cmdline(argv)
  # optparse is only needed by this entry point, so it is loaded lazily
  require 'optparse'

  # only used to show the real default values in the usage message; the
  # defaults themselves are merged in by peptide_centric_db
  defaults = DEFAULT_PEPTIDE_CENTRIC_DB

  opt = {
    :remove_digestion_file => true,
    :enzyme => Ms::InSilico::Digester::TRYPSIN
  }
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
    op.separator "output: "
    op.separator "    <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
    op.separator "format:"
    op.separator "    PEPTIDE: ID1<tab>ID2<tab>ID3..."
    op.separator ""
    op.separator "    Initiator Methionines - by default, will generate two peptides"
    op.separator "    for any peptide found at the N-termini starting with 'M'"
    op.separator "    (i.e., one with and one without the leading methionine)"
    op.separator ""
    op.on("--missed-cleavages <#{defaults[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
    op.on("--min-length <#{defaults[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
    op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
    op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
    # enzymes are looked up as upper-cased constants of the digester class
    op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Ms::InSilico::Digester.const_get(v.upcase) }
    op.on("--list-enzymes", "lists approved enzymes and exits") do
      puts Ms::InSilico::Digester::ENZYMES.keys.join("\n")
      exit
    end
  end

  opts.parse!(argv)

  # nothing to digest: show the usage (the map below then returns [])
  puts opts if argv.empty?

  argv.map do |file|
    Ms::Ident::Peptide::Db.peptide_centric_db(file, opt)
  end
end
|
57
|
+
|
58
|
+
# Digests every protein of +fasta_file+ and writes a peptide centric db: one
# line per peptide of the form "PEPTIDE: ID1<tab>ID2...", where each ID is the
# first whitespace-delimited token of a protein header.
#
# Two files are written next to the fasta file:
# * "<base>.msd_clvg<missed_cleavages>.peptides" - a temporary digestion file
#   holding all peptides of every protein (min_length is NOT applied here).
#   It is deleted afterwards unless :remove_digestion_file is false.
# * "<base>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml" - the final
#   db, restricted to peptides of at least min_length residues.
#
# opts:: merged over DEFAULT_PEPTIDE_CENTRIC_DB. Keys read here are
#        :missed_cleavages, :min_length, :enzyme (must respond to
#        digest(sequence, missed_cleavages)), :remove_digestion_file,
#        :cleave_initiator_methionine and :expand_aa (nil or empty disables
#        expansion). :id_regexp is accepted but not used by this method.
#
# Timing information is printed when $VERBOSE is set.
# Returns the full name of the written db file.
def self.peptide_centric_db(fasta_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

  (missed_cleavages, min_length, enzyme, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa)
  start_time = Time.now
  print "Digesting #{fasta_file} ..." if $VERBOSE

  # an empty character class is not a valid regexp, so an empty expansion
  # table is treated like no expansion at all
  letters_to_expand_re = nil
  if expand_aa && !expand_aa.empty?
    letters_to_expand_re = Regexp.new("[#{Regexp.escape(expand_aa.keys.join)}]")
  end

  base = fasta_file.chomp(File.extname(fasta_file))
  digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
  File.open(digestion_file, "w") do |fh|
    Ms::Fasta.open(fasta_file) do |fasta|
      fasta.each do |prot|
        peptides = enzyme.digest(prot.sequence, missed_cleavages)
        if cleave_initiator_methionine && (prot.sequence[0,1] == "M")
          # every peptide sitting at the N-terminus is also emitted without
          # its leading (initiator) methionine
          n_terminal = peptides.select {|pep| prot.sequence[0,pep.size] == pep }
          peptides.push(*n_terminal.map {|pep| pep[1..-1] })
        end
        if letters_to_expand_re
          # expand_peptides returns nil for peptides with too many letters to
          # expand; those peptides are dropped (compact)
          peptides = peptides.map do |pep|
            (pep =~ letters_to_expand_re) ? expand_peptides(pep, expand_aa) : pep
          end.flatten.compact
        end
        fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE


  start_time = Time.now
  print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE

  # peptide aaseq => ids of the proteins it was found in
  hash = Hash.new {|h,k| h[k] = [] }
  ::IO.foreach(digestion_file) do |line|
    # non-destructive chomp: chomp! would return nil on a line lacking a newline
    (prot, *peps) = line.chomp.split(/\s+/)
    # prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot
    peps.each do |pep|
      hash[pep] << prot if pep.size >= min_length
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  base = digestion_file.chomp(File.extname(digestion_file))
  final_outfile = base + ".min_aaseq#{min_length}" + ".yml"

  start_time = Time.now
  print "Writing results to #{final_outfile} ..." if $VERBOSE

  File.open(final_outfile, 'w') do |out|
    hash.each do |k,v|
      out.puts( [k, v.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  File.unlink(digestion_file) if remove_digestion_file
  File.expand_path(final_outfile)
end
|
140
|
+
|
141
|
+
# Does combinatorial expansion of every letter of +peptide+ that requests it.
#
# peptide::   the amino acid sequence (String)
# expand_aa:: hash mapping a letter to the array of letters that replace it,
#             like: {'X'=>STANDARD_AA}
#
# Returns an array with every combination of replacements.
# Returns [peptide] if the peptide contains no letter to expand.
# Returns nil if there are more than MAX_NUM_AA_EXPANSION letters to be
# expanded (protects against a combinatorial explosion).
def self.expand_peptides(peptide, expand_aa)
  # positions of the letters that have an entry in the expansion table
  positions = []
  peptide.split('').each_with_index do |char, i|
    positions << i if expand_aa.key?(char)
  end
  return nil if positions.size > MAX_NUM_AA_EXPANSION

  # expand one position at a time; each pass multiplies the candidates by the
  # number of replacements available for the letter at that position
  positions.inject([peptide]) do |candidates, i|
    replacements = expand_aa[peptide[i, 1]]
    candidates.inject([]) do |expanded, candidate|
      expanded.concat(replacements.map {|aa| variant = candidate.dup ; variant[i] = aa ; variant })
    end
  end
end
|
167
|
+
|
168
|
+
# An object for on-disk retrieval of db entries. Only the byte position of
# each entry is held in memory; the protein lists are read from the file on
# demand and returned as arrays. Behaves much like a (read-only) hash once it
# is opened.
class IO
  include Enumerable

  # Opens +filename+, yields a db reader for it and closes the file when the
  # block returns. Returns the value of the block.
  def self.open(filename, &block)
    File.open(filename) do |io|
      block.call(self.new(io))
    end
  end

  # the underlying io object the entries are read from
  attr_accessor :io
  # hash of key (peptide aaseq) => [byte offset of the value, byte length]
  attr_accessor :index

  # Scans +io+ once from its current position to build the index. Lines that
  # do not start with "<word characters><KEY_VALUE_DELIMITER>" are skipped.
  def initialize(io)
    @io = io
    @index = {}
    re = /^(\w+)#{Regexp.escape(KEY_VALUE_DELIMITER)}/
    line_start = io.pos
    io.each_line do |line|
      line_end = io.pos
      if (md = re.match(line))
        key = md[1]
        # the value begins right after the key and the delimiter and runs to
        # the end of the line (trailing newline included)
        value_start = line_start + key.bytesize + KEY_VALUE_DELIMITER.bytesize
        @index[key] = [value_start, line_end - value_start]
      end
      line_start = line_end
    end
  end

  # returns an array of proteins for the given key (peptide aaseq), or nil if
  # the key is not in the db
  def [](key)
    (start, length) = @index[key]
    return nil unless start
    @io.seek(start)
    @io.read(length).to_s.chomp.split(PROTEIN_DELIMITER)
  end

  # number of entries
  def size ; @index.size end
  alias_method :length, :size

  # all the peptide aaseqs
  def keys
    @index.keys
  end

  # all the protein lists
  def values
    keys.map {|key| self[key] }
  end

  # yields a pair of aaseq and protein array (returns an enumerator when no
  # block is given)
  def each(&block)
    return enum_for(:each) unless block
    @index.each_key do |key|
      block.call([key, self[key]])
    end
  end
end
|
224
|
+
end
|
data/lib/ms/ident/peptide.rb
CHANGED
data/lib/ms/ident/protein.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
-
|
2
1
|
module Ms ; end
|
3
2
|
module Ms::Ident ; end
|
4
3
|
|
4
|
+
require 'set'
|
5
|
+
|
5
6
|
module Ms::Ident::Protein
|
6
7
|
|
7
8
|
class << self
|
@@ -13,5 +14,57 @@ module Ms::Ident::Protein
|
|
13
14
|
reference.split(/[\s\r]/)[0]
|
14
15
|
end
|
15
16
|
|
17
|
+
# Default greediness metric for a [protein_group, peptide_hits] doublet:
# [# unique aaseqs, # unique aaseq+charge combinations, # peptide hits].
# Larger values sort later.
PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
  hits = protein_group_and_peptide_hits.last
  aaseqs = hits.map {|hit| hit.aaseq }
  aaseqs_at_z = hits.map {|hit| [hit.aaseq, hit.charge] }
  [aaseqs.uniq.size, aaseqs_at_z.uniq.size, hits.size]
end
|
23
|
+
|
24
|
+
|
25
|
+
module_function
|
26
|
+
# Greedy algorithm to map a set of peptide_hits to protein groups.
#
# Each peptide hit should respond to :proteins (and to :aaseq and :charge
# when the default metric is used). Proteins that are pointed to by exactly
# the same set of peptide hits form one protein group.
#
# If a block is given, it is yielded a single argument: a doublet of
# protein_group and peptide set. It should return a metric (or array) to sort
# by; the greediest groups should sort to the back. If no block is given, the
# groups are sorted by [# uniq aaseqs, # uniq aaseq+charge, # peptide_hits]
# (see PRIORITIZE_PROTEINS).
#
# Sets of peptide_hits and the objects returned by peptide_hit#proteins are
# used as hash keys. As long as each peptide hit has a unique signature (like
# an id) then any object will work. If they are Struct objects, you might
# consider redefining the #hash method to be object_id for performance and
# accuracy.
#
# Returns an array of [protein_group, peptide_set] doublets, greediest first.
# Groups whose peptide hits are all claimed by greedier groups are discarded.
def peptide_hits_to_protein_groups(peptide_hits, &sort_by)
  sort_by ||= PRIORITIZE_PROTEINS

  # protein => every peptide hit that points to it
  protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
  peptide_hits.each do |peptide_hit|
    peptide_hit.proteins.each do |protein|
      protein_to_peptides[protein] << peptide_hit
    end
  end

  # proteins with an identical peptide set cannot be told apart: group them
  peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
  protein_to_peptides.each do |protein, peptide_set|
    peptides_to_protein_group[peptide_set] << protein
  end

  greedy_first = peptides_to_protein_group.invert.sort_by(&sort_by).reverse

  # we are discarding the subsumed sets, but we could get them with
  # partition
  accounted_for = Set.new
  greedy_first.select do |_group, peptide_set|
    # Set#add? is nil for a hit that was already claimed. Every hit of the
    # set must be visited (no short-circuit) so that a surviving group
    # claims all of its hits.
    peptide_set.count {|peptide_hit| accounted_for.add?(peptide_hit) } > 0
  end
end
|
68
|
+
|
16
69
|
end
|
17
70
|
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
path = 'ms/ident/peptide/db'
|
5
|
+
require path
|
6
|
+
|
7
|
+
require 'stringio'

module Kernel

  # Runs the given block with $stdout redirected to an in-memory buffer and
  # returns everything that was written to it as a String.
  #
  # Whatever $stdout was before the call is always put back afterwards (also
  # when the block raises or exits), so calls may be nested or made while
  # output is already redirected.
  def capture_stdout
    previous_stdout = $stdout
    buffer = StringIO.new
    $stdout = buffer
    yield
    buffer.string
  ensure
    $stdout = previous_stdout
  end

end
|
20
|
+
|
21
|
+
FASTA_FILE = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
|
22
|
+
|
23
|
+
# Specs for Ms::Ident::Peptide::Db.expand_peptides
describe 'amino acid expansion' do

  it 'can expand out wildcard amino acid combinations' do
    expansion_table = { 'X' => %w(* % &), 'L' => %w(P Q) }
    expanded = Ms::Ident::Peptide::Db.expand_peptides('ALXX', expansion_table)
    # 2 choices for L times 3 * 3 choices for the two X's
    expanded.sort.is %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
  end

  it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
    # this is from real data
    worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
    reply = Ms::Ident::Peptide::Db.expand_peptides(worst_case, 'X' => %w(* % &))
    reply.nil?.is true
  end

  it 'returns the peptide in the array if no expansion' do
    expansion_table = { 'X' => %w(* % &), 'L' => %w(P Q) }
    unchanged = Ms::Ident::Peptide::Db.expand_peptides('ZZZZZ', expansion_table)
    unchanged.is ['ZZZZZ']
  end

end
|
42
|
+
|
43
|
+
# Specs for building the db through the command-line entry point
describe 'creating a peptide centric database' do

  before do
    # the file that digesting FASTA_FILE with the default options produces
    @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
  end

  it 'converts a fasta file into peptide centric db' do
    begin
      output_files = Ms::Ident::Peptide::Db.cmdline([FASTA_FILE])
      output_files.first.is File.expand_path(@output_file)
      ok File.exist?(@output_file)
      hash = {}
      YAML.load_file(@output_file).each do |k,v|
        hash[k] = v.split("\t")
      end
      sorted = hash.sort
      # these are merely frozen, not perfectly defined
      sorted.first.is ["AAFDDAIAELDTLSEESYK", ["sp|P62258|1433E_HUMAN"]]
      sorted.last.is ["YWCRLGPPRWICQTIVSTNQYTHHR", ["tr|D2KTA8|D2KTA8_HUMAN"]]
      sorted.size.is 728
    ensure
      # a failed assertion raises, so clean up here to never leave the
      # generated file behind
      File.unlink(@output_file) if File.exist?(@output_file)
    end
  end

  it 'lists approved enzymes and exits' do
    output = capture_stdout do
      begin
        Ms::Ident::Peptide::Db.cmdline(['--list-enzymes'])
      rescue SystemExit
        1.is 1 # we exited
      end
    end
    lines = output.split("\n")
    ok lines.include?("trypsin")
    ok lines.include?("chymotrypsin")
  end
end
|
80
|
+
|
81
|
+
# Specs for Ms::Ident::Peptide::Db::IO (on-disk access to a written db)
describe 'reading a peptide centric database' do

  # build the db right before it is needed (not while the specs are being
  # defined) ...
  before do
    @outfile = Ms::Ident::Peptide::Db.cmdline([FASTA_FILE]).first
  end

  # ... and do not leave the generated file behind
  after do
    File.unlink(@outfile) if @outfile && File.exist?(@outfile)
  end

  it 'reads the file on disk with random access or is enumerable' do
    Ms::Ident::Peptide::Db::IO.open(@outfile) do |io|
      io["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
      io["VRAAR"].enums ["tr|D3DX18|D3DX18_HUMAN"]
      # each entry is a pair of aaseq and protein array
      io.each do |key_prots|
        key_prots.first.isa String
        key_prots.last.isa Array
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'ms/ident/protein'
|
4
|
+
|
5
|
+
# Minimal peptide hit used by the specs: responds to :aaseq, :charge and
# :proteins.
PeptideHit = Struct.new(:aaseq, :charge, :proteins) do
  def inspect # easier to read output
    "<PeptideHit aaseq=#{aaseq} charge=#{charge} proteins(ids)=#{proteins.map(&:id).join(',')}>"
  end

  # Hits are used as hash keys and set members, and each hit must count as
  # its own entry even when another hit has the same field values. #hash is
  # therefore identity based, and #eql? has to agree with it (objects that
  # are eql? must have equal hashes).
  def hash ; object_id end
  def eql?(other) ; equal?(other) end
end
|
11
|
+
# Minimal protein used by the specs: only carries an :id.
ProteinHit = Struct.new(:id) do
  def inspect # easier to read output
    "<Prt #{id}>"
  end

  # Proteins are used as hash keys; identity based #hash needs an identity
  # based #eql? to go with it (objects that are eql? must have equal hashes).
  def hash ; object_id end
  def eql?(other) ; equal?(other) end
end
|
17
|
+
|
18
|
+
# Specs for Ms::Ident::Protein.peptide_hits_to_protein_groups
describe 'creating minimal protein groups from peptide hits' do
  before do
    # five peptide hits, not yet linked to any protein
    aaseqs_and_charges = [ ['AABBCCDD', 2],
      ['BBCC', 2],
      ['DDEEFFGG', 2],
      ['DDEEFFGG', 3],
      ['HIYA', 2],
    ]
    @pep_hits = aaseqs_and_charges.map {|ar| PeptideHit.new(ar[0], ar[1], []) }
    # protein id => the peptide hits belonging to that protein
    @prot_hits_hash = {
      'big_guy' => @pep_hits,
      'little_guy' => [@pep_hits.last],
      'medium_guy1' => @pep_hits[0,4],
      'medium_guy2' => @pep_hits[0,4],
      'subsumed_by_medium' => @pep_hits[2,2],
    }
    @prot_hits = @prot_hits_hash.keys.map {|id| ProteinHit.new(id) }
  end

  it 'is a greedy algorithm' do
    # link every protein to its peptide hits
    @prot_hits.each do |prthit|
      @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit }
    end
    # big_guy has all the peptides, so it takes them all
    reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
    reply.first.size.is 2 # the group and the peptide set
    reply.first.first.size.is 1 # the group
    reply.first.first.first.id.is 'big_guy'
  end

  it 'removes proteins accounted for only as little pieces of larger proteins' do
    # everything but big_guy (the first protein) is linked
    @prot_hits[1..-1].each do |prthit|
      @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit }
    end
    reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
    # no subsumed_by_medium
    protein_lists = reply.map(&:first)
    protein_lists.any? {|protein_list| protein_list.any? {|v| v.id == 'subsumed_by_medium' }}.is false
  end

  it 'allows alternate sorting algorithms for greediness' do
    @prot_hits.each do |prthit|
      @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit }
    end
    reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
      # deliberately using a counterintuitive sorting method to give little
      # guys a chance
      -prot_and_peptide_hits.last.size
    end
    # because the little proteins are given priority, they 'survive'. Bigger
    # proteins may also survive if they have at least one unique peptide
    # to add to the mix. This demonstrates how proteins can be weighted in
    # different ways based on their peptide hits.
    seen = []
    reply.each do |pair|
      pair.first.each {|prot| seen << prot.id }
    end
    # big guy is completely accounted for in the now prioritized little guy
    # and medium guys, etc.
    seen.sort.is @prot_hits_hash.keys[1..-1].sort
  end
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,18 +1,8 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require 'bundler'
|
3
2
|
|
4
3
|
$spec_large = ENV['SPEC_LARGE']
|
5
|
-
development = $spec_large ? :development_large : :development
|
6
|
-
|
7
|
-
begin
|
8
|
-
Bundler.setup(:default, development)
|
9
|
-
rescue Bundler::BundlerError => e
|
10
|
-
$stderr.puts e.message
|
11
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
12
|
-
exit e.status_code
|
13
|
-
end
|
14
|
-
require 'spec/more'
|
15
4
|
|
5
|
+
require 'spec/more'
|
16
6
|
|
17
7
|
load_testdata = lambda do
|
18
8
|
require 'ms/testdata'
|
@@ -26,7 +16,6 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
26
16
|
|
27
17
|
Bacon.summary_on_exit
|
28
18
|
|
29
|
-
|
30
19
|
def spec_large(&block)
|
31
20
|
if $spec_large
|
32
21
|
block.call
|
@@ -0,0 +1,69 @@
|
|
1
|
+
>sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3
|
2
|
+
MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS
|
3
|
+
WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY
|
4
|
+
LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY
|
5
|
+
YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD
|
6
|
+
AGEGEN
|
7
|
+
>sp|P31946-2|1433B_HUMAN Isoform Short of 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB
|
8
|
+
MDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWR
|
9
|
+
VISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLK
|
10
|
+
MKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYE
|
11
|
+
ILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDAG
|
12
|
+
EGEN
|
13
|
+
>sp|P62258|1433E_HUMAN 14-3-3 protein epsilon OS=Homo sapiens GN=YWHAE PE=1 SV=1
|
14
|
+
MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLSVAYKNVIGARRASW
|
15
|
+
RIISSIEQKEENKGGEDKLKMIREYRQMVETELKLICCDILDVLDKHLIPAANTGESKVF
|
16
|
+
YYKMKGDYHRYLAEFATGNDRKEAAENSLVAYKAASDIAMTELPPTHPIRLGLALNFSVF
|
17
|
+
YYEILNSPDRACRLAKAAFDDAIAELDTLSEESYKDSTLIMQLLRDNLTLWTSDMQGDGE
|
18
|
+
EQNKEALQDVEDENQ
|
19
|
+
>sp|Q04917|1433F_HUMAN 14-3-3 protein eta OS=Homo sapiens GN=YWHAH PE=1 SV=4
|
20
|
+
MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSW
|
21
|
+
RVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVCNDVLSLLDKFLIKNCNDFQYESK
|
22
|
+
VFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEISKEQMQPTHPIRLGLALNFS
|
23
|
+
VFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDE
|
24
|
+
EAGEGN
|
25
|
+
>sp|P61981|1433G_HUMAN 14-3-3 protein gamma OS=Homo sapiens GN=YWHAG PE=1 SV=2
|
26
|
+
MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLSVAYKNVVGARRSSW
|
27
|
+
RVISSIEQKTSADGNEKKIEMVRAYREKIEKELEAVCQDVLSLLDNYLIKNCSETQYESK
|
28
|
+
VFYLKMKGDYYRYLAEVATGEKRATVVESSEKAYSEAHEISKEHMQPTHPIRLGLALNYS
|
29
|
+
VFYYEIQNAPEQACHLAKTAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDD
|
30
|
+
DGGEGNN
|
31
|
+
>sp|P31947|1433S_HUMAN 14-3-3 protein sigma OS=Homo sapiens GN=SFN PE=1 SV=1
|
32
|
+
MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSVAYKNVVGGQRAAWR
|
33
|
+
VLSSIEQKSNEEGSEEKGPEVREYREKVETELQGVCDTVLGLLDSHLIKEAGDAESRVFY
|
34
|
+
LKMKGDYYRYLAEVATGDDKKRIIDSARSAYQEAMDISKKEMPPTNPIRLGLALNFSVFH
|
35
|
+
YEIANSPEEAISLAKTTFDEAMADLHTLSEDSYKDSTLIMQLLRDNLTLWTADNAGEEGG
|
36
|
+
EAPQEPQS
|
37
|
+
>tr|D2KLI3|D2KLI3_HUMAN Cytochrome b OS=Homo sapiens GN=CYTB PE=3 SV=1
|
38
|
+
MTPTRKTNPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQITTGLFLAMHYSPDAS
|
39
|
+
TAFSSIAHITRDVNYGWIIRYLHANGASMFFICLFLHIGRGLYYGSFLYSETWNIGIILL
|
40
|
+
LATMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTDLVQWIWGGYSVDSPTLTRFFT
|
41
|
+
FHFILPFIIAALAALHLLFLHETGSNNPLGITSHSDKITFHPYYTIKDALGLLLFLLSLM
|
42
|
+
TLTLFSPDLLGDPDNYTLANPLNTPPHIKPEWYFLFAYTILRSVPNKLGGVLALLLSILI
|
43
|
+
LAMIPILHMSKQQSMMFRPLSQSLYWLLAADLLILTWIGGQPVSYPFTIIGQVASVLYFT
|
44
|
+
TILILMPTISLIENKMLKWA
|
45
|
+
>tr|D2KTA8|D2KTA8_HUMAN Putative uncharacterized protein FCAMR OS=Homo sapiens GN=FCAMR PE=4 SV=1
|
46
|
+
MDGEATVKPGEQVPLWTHGWPPDDPSPSFAAGSSFALPQKRPHPRWLWEGSLPSRTHLRA
|
47
|
+
MGTLRPSSPLCWREESSFAAPNSLKGSRLVSGEPGGAVTIQCHYAPSSVNRHQRKYWCRL
|
48
|
+
GPPRWICQTIVSTNQYTHHRYRDRVALTDFPQRGLFVVRLSQLSPDDIGCYLCGIGSENN
|
49
|
+
MLFLSMNLTISAGPASTLPTATPAAGELTMRSYGTASPVANRWTPGTTQTLGQGTAWDTV
|
50
|
+
ASTPGTSKTTASAEGRRTPGATRPAAPGTGSWAEGSVKAPAPIPESPPSKSRSMSNTTEG
|
51
|
+
VWEGTRSSVTNRARASKDRREMTTTKADRPREDIEGVRIALDAAKKVLGTIGPPALVSET
|
52
|
+
LAWEILPQATPVSKQQSQGSIGETTPAAGMWTLGTPAADVWILGTPAADVWTSMEAASGE
|
53
|
+
GSAAGDLDAATGDRGPQATLSQTPAVGPWGPPGKESSVKRTFPEDESSSRTLAPVSTMLA
|
54
|
+
LFMLMALVLLQRKLWRRRTSQEAERVTLIQMTHFLEVNPQADQLPHVERKMLQDDSLPAG
|
55
|
+
ASLTAPERNPGP
|
56
|
+
>tr|D2KTA9|D2KTA9_HUMAN Putative uncharacterized protein XKR5 OS=Homo sapiens GN=XKR5 PE=4 SV=1
|
57
|
+
MHARLLGLSALLQAAEQSARLYTVAYYFTTGRLLWGWLALAVLLPGFLVQALSYLWFRAD
|
58
|
+
GHPGHCSLMMLHLLQLGVWKRHWDAALTSLQKELEAPHRGWLQLQEADLSALRLLEALLQ
|
59
|
+
TGPHLLLQTYVFLASDFTDIVPGVSTLFSWSSLSWALVSYTRFMGFMKPGHLAMPWAALF
|
60
|
+
CQQLWRMGMLGTRVLSLVLFYKAYHFWVFVVAGAHWLVMTFWLVAQQSDIIDSTCHWRLF
|
61
|
+
NLLVGAVYILCYLSFWDSPSRNRMVTFYMVMLLENIILLLLATDFLQGASVDQPADHSWG
|
62
|
+
PVWISDWQCLTGNLLQPAASKIHRHLAGLPKEVLWHCRR
|
63
|
+
>tr|D3DSH8|D3DSH8_HUMAN HCG2036819, isoform CRA_a OS=Homo sapiens GN=hCG_2036819 PE=4 SV=1
|
64
|
+
MLGWIQPSRQPQLRAAPPTRTPSAKRCILCNFLPGCWLVGDVAGSRQPSAPQTLRQRQHT
|
65
|
+
RPPPQERGSGRRSPLREARRANPHFKSFPVLEARGLPCGARRTGPRRPVREMTLPSDPER
|
66
|
+
ATLPNPRLGAPAVPRRGPRSHGGRR
|
67
|
+
>tr|D3DX18|D3DX18_HUMAN Putative uncharacterized protein LOC128977 OS=Homo sapiens GN=LOC128977 PE=4 SV=1
|
68
|
+
MADGSGWQPPRPCEAYRAEWKLCRSARHFLHHYYVHGERPACEQWQRDLASCRDWEERRN
|
69
|
+
AEAQQSLCESERARVRAARKHILVWAPRQSPPPDWHLPLPQEKDE
|