ms-ident 0.0.17 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +7 -12
- data/VERSION +1 -1
- data/lib/ms/ident/peptide/db.rb +224 -0
- data/lib/ms/ident/peptide.rb +0 -1
- data/lib/ms/ident/protein.rb +54 -1
- data/spec/ms/ident/peptide/db_spec.rb +95 -0
- data/spec/ms/ident/protein_spec.rb +69 -0
- data/spec/spec_helper.rb +1 -12
- data/spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.fasta +69 -0
- data/spec/tfiles/ms/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -0
- metadata +24 -39
- data/Gemfile +0 -31
- data/Gemfile.lock +0 -32
data/Rakefile
CHANGED
@@ -1,12 +1,4 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require 'bundler'
|
3
|
-
begin
|
4
|
-
Bundler.setup(:default, :development)
|
5
|
-
rescue Bundler::BundlerError => e
|
6
|
-
$stderr.puts e.message
|
7
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
-
exit e.status_code
|
9
|
-
end
|
10
2
|
require 'rake'
|
11
3
|
|
12
4
|
require 'jeweler'
|
@@ -20,10 +12,13 @@ Jeweler::Tasks.new do |gem|
|
|
20
12
|
gem.email = "jtprince@gmail.com"
|
21
13
|
gem.authors = ["John T. Prince"]
|
22
14
|
gem.rubyforge_project = 'mspire'
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
15
|
+
gem.add_runtime_dependency 'nokogiri'
|
16
|
+
gem.add_runtime_dependency 'ms-core', ">=0.0.12"
|
17
|
+
gem.add_runtime_dependency 'ms-in_silico'
|
18
|
+
gem.add_runtime_dependency 'andand'
|
19
|
+
gem.add_development_dependency 'spec-more'
|
20
|
+
gem.add_development_dependency 'jeweler'
|
21
|
+
#gem.add_development_dependency 'ms-testdata'
|
27
22
|
end
|
28
23
|
Jeweler::RubygemsDotOrgTasks.new
|
29
24
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.17
|
1
|
+
0.0.18
|
@@ -0,0 +1,224 @@
|
|
1
|
+
require 'ms/in_silico/digester'
|
2
|
+
require 'ms/fasta'
|
3
|
+
|
4
|
+
module Ms ; end
|
5
|
+
module Ms::Ident ; end
|
6
|
+
module Ms::Ident::Peptide ; end
|
7
|
+
|
8
|
+
module Ms::Ident::Peptide::Db
|
9
|
+
MAX_NUM_AA_EXPANSION = 3
|
10
|
+
|
11
|
+
# the twenty standard amino acids
|
12
|
+
STANDARD_AA = %w(A C D E F G H I K L M N P Q R S T V W Y)
|
13
|
+
|
14
|
+
DEFAULT_PEPTIDE_CENTRIC_DB = {:missed_cleavages => 2, :min_length => 4, :enzyme => Ms::InSilico::Digester::TRYPSIN, :id_regexp => nil, :remove_digestion_file => true, :cleave_initiator_methionine => true, :expand_aa => {'X' => STANDARD_AA}}
|
15
|
+
|
16
|
+
PROTEIN_DELIMITER = "\t"
|
17
|
+
KEY_VALUE_DELIMITER = ": "
|
18
|
+
|
19
|
+
# Command-line entry point: parses +argv+ and builds one peptide centric
# database for every fasta file named on the command line.
#
# argv - Array of command-line Strings (option switches plus fasta file
#        names).  The option parser removes the switches it recognizes from
#        this array in place.
#
# Prints the usage message and exits when no fasta file is given.
# Returns an Array holding the return value of peptide_centric_db for each
# fasta file, in the order given.
def self.cmdline(argv)
  # only the command-line interface needs the option parser, so load it lazily
  require 'optparse'

  # defaults are shown in the usage text; only options explicitly set by
  # the user are passed along (peptide_centric_db merges in the rest)
  defaults = DEFAULT_PEPTIDE_CENTRIC_DB
  opt = {
    :remove_digestion_file => true,
    :enzyme => Ms::InSilico::Digester::TRYPSIN
  }
  opts = OptionParser.new do |op|
    op.banner = "usage: #{File.basename($0)} <file>.fasta ..."
    op.separator "output: "
    op.separator " <file>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml"
    op.separator "format:"
    op.separator " PEPTIDE: ID1<tab>ID2<tab>ID3..."
    op.separator ""
    op.separator " Initiator Methionines - by default, will generate two peptides"
    op.separator " for any peptide found at the N-termini starting with 'M'"
    op.separator " (i.e., one with and one without the leading methionine)"
    op.separator ""
    op.on("--missed-cleavages <#{defaults[:missed_cleavages]}>", Integer, "max num of missed cleavages") {|v| opt[:missed_cleavages] = v }
    op.on("--min-length <#{defaults[:min_length]}>", Integer, "the minimum peptide aaseq length") {|v| opt[:min_length] = v }
    op.on("--no-cleaved-methionine", "does not cleave off initiator methionine") { opt[:cleave_initiator_methionine] = false }
    op.on("--no-expand-x", "don't enumerate aa 'X' possibilities") { opt[:expand_aa] = nil }
    # enzymes are looked up as upper-case constants on the digester module
    op.on("-e", "--enzyme <name>", "enzyme for digestion") {|v| opt[:enzyme] = Ms::InSilico::Digester.const_get(v.upcase) }
    op.on("--list-enzymes", "lists approved enzymes and exits") do
      puts Ms::InSilico::Digester::ENZYMES.keys.join("\n")
      exit
    end
  end

  opts.parse!(argv)

  # nothing left to digest: show the usage message and stop
  if argv.empty?
    puts opts
    exit
  end

  argv.map do |file|
    Ms::Ident::Peptide::Db.peptide_centric_db(file, opt)
  end
end
|
57
|
+
|
58
|
+
# Builds a peptide centric database (peptide aaseq => protein ids) from a
# fasta file and writes it next to the fasta file as
# <base>.msd_clvg<missed_cleavages>.min_aaseq<min_length>.yml
#
# A temporary digestion file (<base>.msd_clvg<missed_cleavages>.peptides) is
# written first; it holds every peptide from the digestion (min_length is
# NOT applied to it, only to the final peptide centric db).
#
# fasta_file - String path of the fasta file to digest
# opts       - Hash of options merged over DEFAULT_PEPTIDE_CENTRIC_DB:
#              :missed_cleavages            - passed to enzyme.digest
#              :min_length                  - shortest peptide kept in the db
#              :enzyme                      - object responding to
#                                             digest(sequence, missed_cleavages)
#              :remove_digestion_file       - delete the temporary file when done
#              :cleave_initiator_methionine - also emit N-terminal peptides
#                                             without their leading 'M'
#              :expand_aa                   - Hash of letter => replacement
#                                             letters (nil or empty disables)
#              :id_regexp                   - accepted but not used here
#
# Progress messages are printed only when $VERBOSE is set.
# Returns the full (expanded) path of the written file.
def self.peptide_centric_db(fasta_file, opts={})
  opts = DEFAULT_PEPTIDE_CENTRIC_DB.merge(opts)

  (missed_cleavages, min_length, enzyme, remove_digestion_file, cleave_initiator_methionine, expand_aa) = opts.values_at(:missed_cleavages, :min_length, :enzyme, :remove_digestion_file, :cleave_initiator_methionine, :expand_aa)
  start_time = Time.now
  print "Digesting #{fasta_file} ..." if $VERBOSE

  # an empty expansion table would build an invalid regexp ("[]"), so treat
  # it the same as no expansion at all
  expand_aa = nil if expand_aa && expand_aa.empty?
  if expand_aa
    letters_to_expand_re = Regexp.new("[#{Regexp.escape(expand_aa.keys.join)}]")
  end

  base = fasta_file.chomp(File.extname(fasta_file))
  digestion_file = base + ".msd_clvg#{missed_cleavages}.peptides"
  File.open(digestion_file, "w") do |fh|
    Ms::Fasta.open(fasta_file) do |fasta|
      fasta.each do |prot|
        peptides = enzyme.digest(prot.sequence, missed_cleavages)
        if cleave_initiator_methionine && (prot.sequence[0,1] == "M")
          # every peptide sitting at the start of the protein also gets a
          # variant with the initiator methionine removed
          n_terminal = peptides.select {|pep| prot.sequence[0,pep.size] == pep }
          peptides.push(*n_terminal.map {|pep| pep[1..-1] })
        end
        if expand_aa
          expanded = peptides.map do |pep|
            pep =~ letters_to_expand_re ? expand_peptides(pep, expand_aa) : pep
          end
          # expand_peptides answers nil for peptides with too many letters
          # to expand; those peptides are dropped
          peptides = expanded.flatten.compact
        end
        # one line per protein: first word of the header, a tab, then the
        # space separated peptides
        fh.puts( prot.header.split(/\s+/).first + "\t" + peptides.join(" ") )
      end
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  start_time = Time.now
  print "Organizing raw digestion #{digestion_file} ..." if $VERBOSE

  hash = Hash.new {|h,k| h[k] = [] }
  ::IO.foreach(digestion_file) do |line|
    # non-destructive chomp: chomp! answers nil when there is no trailing
    # newline to remove
    (prot, *peps) = line.chomp.split(/\s+/)
    # prot is something like this: "sp|P31946|1433B_HUMAN" in uniprot
    peps.each do |pep|
      hash[pep] << prot if pep.size >= min_length
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  base = digestion_file.chomp(File.extname(digestion_file))
  final_outfile = base + ".min_aaseq#{min_length}" + ".yml"

  start_time = Time.now
  print "Writing results to #{final_outfile} ..." if $VERBOSE

  File.open(final_outfile, 'w') do |out|
    hash.each do |aaseq, proteins|
      out.puts( [aaseq, proteins.join(PROTEIN_DELIMITER)].join(KEY_VALUE_DELIMITER) )
    end
  end
  puts "#{Time.now - start_time} sec" if $VERBOSE

  File.unlink(digestion_file) if remove_digestion_file
  File.expand_path(final_outfile)
end
|
140
|
+
|
141
|
+
# Does combinatorial expansion of every letter in +peptide+ that is a key of
# +expand_aa+.
#
# peptide   - String amino acid sequence
# expand_aa - Hash mapping a letter to the Array of letters that replace it,
#             e.g. {'X' => STANDARD_AA}
#
# Returns an Array of the expanded peptide Strings.  When no letter of the
# peptide needs expanding the Array holds just the peptide itself.
# Returns nil if more than MAX_NUM_AA_EXPANSION positions would have to be
# expanded.
def self.expand_peptides(peptide, expand_aa)
  # [position, letter] for every position that must be expanded
  targets = []
  peptide.split('').each_with_index do |char, i|
    targets << [i, char] if expand_aa.key?(char)
  end
  return nil if targets.size > MAX_NUM_AA_EXPANSION

  # expand one position at a time; each pass multiplies the running list by
  # the number of replacements for that position
  targets.inject([peptide]) do |expanded, (i, letter)|
    expanded.flat_map do |pep|
      expand_aa[letter].map {|replacement| variant = pep.dup ; variant[i] = replacement ; variant }
    end
  end
end
|
167
|
+
|
168
|
+
# An object for on disk retrieval of peptide centric db entries.
# Each line of the db has the form "<aaseq>: <protein>\t<protein>...".
# On creation the whole file is scanned once to record where the protein
# list of every peptide lives; the protein lists themselves are read from
# disk on demand and returned as an Array.
# Behaves much like a (read-only) hash once it is opened.
class IO
  include Enumerable

  # Opens +filename+ and calls the block with a db object wrapping it; the
  # file is closed when the block finishes.
  # Returns the value of the block.
  def self.open(filename, &block)
    File.open(filename) do |io|
      block.call(self.new(io))
    end
  end

  # the underlying io object the entries are read from
  attr_accessor :io
  # Hash of aaseq => [byte offset of the protein list, byte length to the
  # end of the line (line terminator included)]
  attr_accessor :index

  # io - an open, seekable io object positioned at the start of the entries
  #
  # Lines that do not start with "<word characters>: " are skipped.
  def initialize(io)
    @io = io
    @index = {}
    re = /^(\w+)#{Regexp.escape(KEY_VALUE_DELIMITER)}/
    line_start = io.pos
    io.each_line do |line|
      # the io now sits at the end of the line just read
      line_end = io.pos
      if (md = re.match(line))
        key = md[1]
        value_start = line_start + key.bytesize + KEY_VALUE_DELIMITER.bytesize
        @index[key] = [value_start, line_end - value_start]
      end
      line_start = line_end
    end
  end

  # returns an array of proteins for the given key (peptide aaseq), or nil
  # if the key is not in the db
  def [](key)
    (start, length) = @index[key]
    return nil unless start
    @io.seek(start)
    @io.read(length).chomp.split(PROTEIN_DELIMITER)
  end

  # number of entries
  def size ; @index.size end
  alias_method :length, :size

  # all the peptide aaseqs
  def keys
    @index.keys
  end

  # all the protein lists
  def values
    keys.map {|key| self[key] }
  end

  # yields a pair of aaseq and protein array
  # (returns an enumerator when called without a block)
  def each(&block)
    return enum_for(:each) unless block
    @index.each_key do |key|
      block.call([key, self[key]])
    end
  end
end
|
224
|
+
end
|
data/lib/ms/ident/peptide.rb
CHANGED
data/lib/ms/ident/protein.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
-
|
2
1
|
module Ms ; end
|
3
2
|
module Ms::Ident ; end
|
4
3
|
|
4
|
+
require 'set'
|
5
|
+
|
5
6
|
module Ms::Ident::Protein
|
6
7
|
|
7
8
|
class << self
|
@@ -13,5 +14,57 @@ module Ms::Ident::Protein
|
|
13
14
|
reference.split(/[\s\r]/)[0]
|
14
15
|
end
|
15
16
|
|
17
|
+
# Default metric for ranking protein groups by greediness.  Takes a doublet
# whose last element is the collection of peptide hits (each responding to
# :aaseq and :charge) and answers the sort key
# [# uniq aaseqs, # uniq aaseq+charge, # peptide hits].
PRIORITIZE_PROTEINS = lambda do |protein_group_and_peptide_hits|
  hits = protein_group_and_peptide_hits.last
  sequences = hits.map {|hit| hit.aaseq }
  sequences_with_charge = hits.map {|hit| [hit.aaseq, hit.charge] }
  [sequences.uniq.size, sequences_with_charge.uniq.size, hits.size]
end
|
23
|
+
|
24
|
+
|
25
|
+
module_function
|
26
|
+
# Greedy algorithm to map a set of peptide_hits to protein groups.  Each
# peptide hit should respond to :proteins (and, for the default ordering,
# to :aaseq and :charge).  Proteins hit by exactly the same set of peptide
# hits are collected into one protein group.
#
# If a block is given, it is yielded a single argument: a doublet of
# protein_group and peptide set.  It should answer a metric or array to
# sort by for creating greedy protein groups (the greediest proteins should
# sort to the back of the array).  If no block is given, the groups are
# sorted by [# uniq aaseqs, # uniq aaseq+charge, # peptide_hits] (see
# PRIORITIZE_PROTEINS).
#
# Sets of peptide_hits and the objects returned by peptide_hit#proteins are
# used as hash keys.  As long as each peptide hit has a unique signature
# (like an id) then any object will work.  If they are Struct objects, you
# might consider redefining the #hash method to be object_id for
# performance and accuracy.
#
# Returns an Array of [protein_group, peptide_set] doublets, greediest
# first.  A group survives only if it has at least one peptide hit not
# already claimed by a greedier group; subsumed groups are discarded.
def peptide_hits_to_protein_groups(peptide_hits, &sort_by)
  sort_by ||= PRIORITIZE_PROTEINS
  protein_to_peptides = Hash.new {|h,k| h[k] = Set.new }
  peptide_hits.each do |peptide_hit|
    peptide_hit.proteins.each do |protein|
      protein_to_peptides[protein] << peptide_hit
    end
  end
  # proteins sharing an identical peptide set fall into the same group
  peptides_to_protein_group = Hash.new {|h,k| h[k] = [] }
  protein_to_peptides.each do |protein, peptide_set|
    peptides_to_protein_group[peptide_set] << protein
  end
  protein_group_to_peptides = peptides_to_protein_group.invert
  greedy_first = protein_group_to_peptides.sort_by(&sort_by).reverse
  accounted_for = Set.new
  # we are discarding the subsumed sets, but we could get them with
  # partition
  greedy_first.select do |_group, peptide_set|
    # Set#add? answers nil for a hit that was already claimed.  count (not
    # any?) is used on purpose: every hit of a group must be visited so
    # that all of them get claimed.
    peptide_set.count {|peptide_hit| accounted_for.add?(peptide_hit) } > 0
  end
end
|
68
|
+
|
16
69
|
end
|
17
70
|
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
path = 'ms/ident/peptide/db'
|
5
|
+
require path
|
6
|
+
|
7
|
+
module Kernel

  # Runs the given block with $stdout redirected into an in-memory buffer.
  # The stream that was active before the call is put back afterwards (also
  # when the block raises), so nested or pre-existing redirections survive.
  #
  # Returns the String of everything written to $stdout inside the block.
  def capture_stdout
    previous = $stdout
    captured = StringIO.new
    $stdout = captured
    yield
    captured.string
  ensure
    $stdout = previous
  end

end
|
20
|
+
|
21
|
+
FASTA_FILE = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
|
22
|
+
|
23
|
+
describe 'amino acid expansion' do
|
24
|
+
|
25
|
+
it 'can expand out wildcard amino acid combinations' do
|
26
|
+
array = Ms::Ident::Peptide::Db.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
|
27
|
+
array.sort.is %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
|
31
|
+
# this is from real data
|
32
|
+
worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
|
33
|
+
Ms::Ident::Peptide::Db.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.is true
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'returns the peptide in the array if no expansion' do
|
37
|
+
array = Ms::Ident::Peptide::Db.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
|
38
|
+
array.is ['ZZZZZ']
|
39
|
+
end
|
40
|
+
|
41
|
+
end
|
42
|
+
|
43
|
+
describe 'creating a peptide centric database' do
|
44
|
+
|
45
|
+
before do
|
46
|
+
|
47
|
+
#@output_file = [TESTFILES, path, 'uni_11_sp_tr.'].join('/')
|
48
|
+
@output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
|
49
|
+
end
|
50
|
+
|
51
|
+
it 'converts a fasta file into peptide centric db' do
|
52
|
+
output_files = Ms::Ident::Peptide::Db.cmdline([FASTA_FILE])
|
53
|
+
output_files.first.is File.expand_path(@output_file)
|
54
|
+
ok File.exist?(@output_file)
|
55
|
+
hash = {}
|
56
|
+
YAML.load_file(@output_file).each do |k,v|
|
57
|
+
hash[k] = v.split("\t")
|
58
|
+
end
|
59
|
+
sorted = hash.sort
|
60
|
+
# these are merely frozen, not perfectly defined
|
61
|
+
sorted.first.is ["AAFDDAIAELDTLSEESYK", ["sp|P62258|1433E_HUMAN"]]
|
62
|
+
sorted.last.is ["YWCRLGPPRWICQTIVSTNQYTHHR", ["tr|D2KTA8|D2KTA8_HUMAN"]]
|
63
|
+
sorted.size.is 728
|
64
|
+
File.unlink(@output_file)
|
65
|
+
end
|
66
|
+
|
67
|
+
it 'lists approved enzymes and exits' do
|
68
|
+
output = capture_stdout do
|
69
|
+
begin
|
70
|
+
Ms::Ident::Peptide::Db.cmdline(['--list-enzymes'])
|
71
|
+
rescue SystemExit
|
72
|
+
1.is 1 # we exited
|
73
|
+
end
|
74
|
+
end
|
75
|
+
lines = output.split("\n")
|
76
|
+
ok lines.include?("trypsin")
|
77
|
+
ok lines.include?("chymotrypsin")
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
describe 'reading a peptide centric database' do
|
82
|
+
outfiles = Ms::Ident::Peptide::Db.cmdline([FASTA_FILE])
|
83
|
+
@outfile = outfiles.first
|
84
|
+
|
85
|
+
it 'reads the file on disk with random access or is enumerable' do
|
86
|
+
Ms::Ident::Peptide::Db::IO.open(@outfile) do |io|
|
87
|
+
io["AVTEQGHELSNEER"].enums %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
|
88
|
+
io["VRAAR"].enums ["tr|D3DX18|D3DX18_HUMAN"]
|
89
|
+
io.each_with_index do |key_prots, i|
|
90
|
+
key_prots.first.isa String
|
91
|
+
key_prots.last.isa Array
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'ms/ident/protein'
|
4
|
+
|
5
|
+
PeptideHit = Struct.new(:aaseq, :charge, :proteins) do
|
6
|
+
def inspect # easier to read output
|
7
|
+
"<PeptideHit aaseq=#{self.aaseq} charge=#{self.charge} proteins(ids)=#{self.proteins.map(&:id).join(',')}>"
|
8
|
+
end
|
9
|
+
def hash ; self.object_id end
|
10
|
+
end
|
11
|
+
ProteinHit = Struct.new(:id) do
|
12
|
+
def inspect # easier to read output
|
13
|
+
"<Prt #{self.id}>"
|
14
|
+
end
|
15
|
+
def hash ; self.object_id end
|
16
|
+
end
|
17
|
+
|
18
|
+
describe 'creating minimal protein groups from peptide hits' do
|
19
|
+
before do
|
20
|
+
@pep_hits = [ ['AABBCCDD', 2],
|
21
|
+
['BBCC', 2],
|
22
|
+
['DDEEFFGG', 2],
|
23
|
+
['DDEEFFGG', 3],
|
24
|
+
['HIYA', 2],
|
25
|
+
].map {|ar| PeptideHit.new(ar[0], ar[1], []) }
|
26
|
+
@prot_hits_hash = {
|
27
|
+
'big_guy' => @pep_hits,
|
28
|
+
'little_guy' => [@pep_hits.last],
|
29
|
+
'medium_guy1' => @pep_hits[0,4],
|
30
|
+
'medium_guy2' => @pep_hits[0,4],
|
31
|
+
'subsumed_by_medium' => @pep_hits[2,2],
|
32
|
+
}
|
33
|
+
@prot_hits = @prot_hits_hash.keys.map {|id| ProteinHit.new(id) }
|
34
|
+
end
|
35
|
+
|
36
|
+
it 'is a greedy algorithm' do
|
37
|
+
@prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
|
38
|
+
# big_guy has all the peptides, so it takes them all
|
39
|
+
reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
|
40
|
+
reply.first.size.is 2 # the group and the peptide set
|
41
|
+
reply.first.first.size.is 1 # the group
|
42
|
+
reply.first.first.first.id.is 'big_guy'
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'removes proteins accounted for only as little pieces of larger proteins' do
|
46
|
+
@prot_hits[1..-1].each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
|
47
|
+
reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits)
|
48
|
+
# no subsumed_by_medium
|
49
|
+
reply.map(&:first).any? {|protein_list| protein_list.any? {|v| v.id == 'subsumed_by_medium' }}.is false
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'allows alternate sorting algorithms for greediness' do
|
53
|
+
@prot_hits.each {|prthit| @prot_hits_hash[prthit.id].each {|pep| pep.proteins << prthit } }
|
54
|
+
reply = Ms::Ident::Protein.peptide_hits_to_protein_groups(@pep_hits) do |prot_and_peptide_hits|
|
55
|
+
# deliberate using a counterintuitive sorting method to give little guys
|
56
|
+
# a chance
|
57
|
+
-prot_and_peptide_hits.last.size
|
58
|
+
end
|
59
|
+
# because the little proteins are given priority, they 'survive'. Bigger
|
60
|
+
# proteins may also survive if they have at least one unique peptide
|
61
|
+
# to add to the mix. This demonstrates how proteins can be weighted in
|
62
|
+
# different ways based on their peptide hits.
|
63
|
+
seen = []
|
64
|
+
reply.each {|pair| pair.first.each {|prot| seen << prot.id } }
|
65
|
+
# big guy is completely accounted for in the now prioritized little guy
|
66
|
+
# and medium guys, etc.
|
67
|
+
seen.sort.is @prot_hits_hash.keys[1..-1].sort
|
68
|
+
end
|
69
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,18 +1,8 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require 'bundler'
|
3
2
|
|
4
3
|
$spec_large = ENV['SPEC_LARGE']
|
5
|
-
development = $spec_large ? :development_large : :development
|
6
|
-
|
7
|
-
begin
|
8
|
-
Bundler.setup(:default, development)
|
9
|
-
rescue Bundler::BundlerError => e
|
10
|
-
$stderr.puts e.message
|
11
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
12
|
-
exit e.status_code
|
13
|
-
end
|
14
|
-
require 'spec/more'
|
15
4
|
|
5
|
+
require 'spec/more'
|
16
6
|
|
17
7
|
load_testdata = lambda do
|
18
8
|
require 'ms/testdata'
|
@@ -26,7 +16,6 @@ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
|
26
16
|
|
27
17
|
Bacon.summary_on_exit
|
28
18
|
|
29
|
-
|
30
19
|
def spec_large(&block)
|
31
20
|
if $spec_large
|
32
21
|
block.call
|
@@ -0,0 +1,69 @@
|
|
1
|
+
>sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3
|
2
|
+
MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS
|
3
|
+
WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY
|
4
|
+
LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY
|
5
|
+
YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD
|
6
|
+
AGEGEN
|
7
|
+
>sp|P31946-2|1433B_HUMAN Isoform Short of 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB
|
8
|
+
MDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWR
|
9
|
+
VISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLK
|
10
|
+
MKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYE
|
11
|
+
ILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDAG
|
12
|
+
EGEN
|
13
|
+
>sp|P62258|1433E_HUMAN 14-3-3 protein epsilon OS=Homo sapiens GN=YWHAE PE=1 SV=1
|
14
|
+
MDDREDLVYQAKLAEQAERYDEMVESMKKVAGMDVELTVEERNLLSVAYKNVIGARRASW
|
15
|
+
RIISSIEQKEENKGGEDKLKMIREYRQMVETELKLICCDILDVLDKHLIPAANTGESKVF
|
16
|
+
YYKMKGDYHRYLAEFATGNDRKEAAENSLVAYKAASDIAMTELPPTHPIRLGLALNFSVF
|
17
|
+
YYEILNSPDRACRLAKAAFDDAIAELDTLSEESYKDSTLIMQLLRDNLTLWTSDMQGDGE
|
18
|
+
EQNKEALQDVEDENQ
|
19
|
+
>sp|Q04917|1433F_HUMAN 14-3-3 protein eta OS=Homo sapiens GN=YWHAH PE=1 SV=4
|
20
|
+
MGDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSW
|
21
|
+
RVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVCNDVLSLLDKFLIKNCNDFQYESK
|
22
|
+
VFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEISKEQMQPTHPIRLGLALNFS
|
23
|
+
VFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDE
|
24
|
+
EAGEGN
|
25
|
+
>sp|P61981|1433G_HUMAN 14-3-3 protein gamma OS=Homo sapiens GN=YWHAG PE=1 SV=2
|
26
|
+
MVDREQLVQKARLAEQAERYDDMAAAMKNVTELNEPLSNEERNLLSVAYKNVVGARRSSW
|
27
|
+
RVISSIEQKTSADGNEKKIEMVRAYREKIEKELEAVCQDVLSLLDNYLIKNCSETQYESK
|
28
|
+
VFYLKMKGDYYRYLAEVATGEKRATVVESSEKAYSEAHEISKEHMQPTHPIRLGLALNYS
|
29
|
+
VFYYEIQNAPEQACHLAKTAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDD
|
30
|
+
DGGEGNN
|
31
|
+
>sp|P31947|1433S_HUMAN 14-3-3 protein sigma OS=Homo sapiens GN=SFN PE=1 SV=1
|
32
|
+
MERASLIQKAKLAEQAERYEDMAAFMKGAVEKGEELSCEERNLLSVAYKNVVGGQRAAWR
|
33
|
+
VLSSIEQKSNEEGSEEKGPEVREYREKVETELQGVCDTVLGLLDSHLIKEAGDAESRVFY
|
34
|
+
LKMKGDYYRYLAEVATGDDKKRIIDSARSAYQEAMDISKKEMPPTNPIRLGLALNFSVFH
|
35
|
+
YEIANSPEEAISLAKTTFDEAMADLHTLSEDSYKDSTLIMQLLRDNLTLWTADNAGEEGG
|
36
|
+
EAPQEPQS
|
37
|
+
>tr|D2KLI3|D2KLI3_HUMAN Cytochrome b OS=Homo sapiens GN=CYTB PE=3 SV=1
|
38
|
+
MTPTRKTNPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQITTGLFLAMHYSPDAS
|
39
|
+
TAFSSIAHITRDVNYGWIIRYLHANGASMFFICLFLHIGRGLYYGSFLYSETWNIGIILL
|
40
|
+
LATMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTDLVQWIWGGYSVDSPTLTRFFT
|
41
|
+
FHFILPFIIAALAALHLLFLHETGSNNPLGITSHSDKITFHPYYTIKDALGLLLFLLSLM
|
42
|
+
TLTLFSPDLLGDPDNYTLANPLNTPPHIKPEWYFLFAYTILRSVPNKLGGVLALLLSILI
|
43
|
+
LAMIPILHMSKQQSMMFRPLSQSLYWLLAADLLILTWIGGQPVSYPFTIIGQVASVLYFT
|
44
|
+
TILILMPTISLIENKMLKWA
|
45
|
+
>tr|D2KTA8|D2KTA8_HUMAN Putative uncharacterized protein FCAMR OS=Homo sapiens GN=FCAMR PE=4 SV=1
|
46
|
+
MDGEATVKPGEQVPLWTHGWPPDDPSPSFAAGSSFALPQKRPHPRWLWEGSLPSRTHLRA
|
47
|
+
MGTLRPSSPLCWREESSFAAPNSLKGSRLVSGEPGGAVTIQCHYAPSSVNRHQRKYWCRL
|
48
|
+
GPPRWICQTIVSTNQYTHHRYRDRVALTDFPQRGLFVVRLSQLSPDDIGCYLCGIGSENN
|
49
|
+
MLFLSMNLTISAGPASTLPTATPAAGELTMRSYGTASPVANRWTPGTTQTLGQGTAWDTV
|
50
|
+
ASTPGTSKTTASAEGRRTPGATRPAAPGTGSWAEGSVKAPAPIPESPPSKSRSMSNTTEG
|
51
|
+
VWEGTRSSVTNRARASKDRREMTTTKADRPREDIEGVRIALDAAKKVLGTIGPPALVSET
|
52
|
+
LAWEILPQATPVSKQQSQGSIGETTPAAGMWTLGTPAADVWILGTPAADVWTSMEAASGE
|
53
|
+
GSAAGDLDAATGDRGPQATLSQTPAVGPWGPPGKESSVKRTFPEDESSSRTLAPVSTMLA
|
54
|
+
LFMLMALVLLQRKLWRRRTSQEAERVTLIQMTHFLEVNPQADQLPHVERKMLQDDSLPAG
|
55
|
+
ASLTAPERNPGP
|
56
|
+
>tr|D2KTA9|D2KTA9_HUMAN Putative uncharacterized protein XKR5 OS=Homo sapiens GN=XKR5 PE=4 SV=1
|
57
|
+
MHARLLGLSALLQAAEQSARLYTVAYYFTTGRLLWGWLALAVLLPGFLVQALSYLWFRAD
|
58
|
+
GHPGHCSLMMLHLLQLGVWKRHWDAALTSLQKELEAPHRGWLQLQEADLSALRLLEALLQ
|
59
|
+
TGPHLLLQTYVFLASDFTDIVPGVSTLFSWSSLSWALVSYTRFMGFMKPGHLAMPWAALF
|
60
|
+
CQQLWRMGMLGTRVLSLVLFYKAYHFWVFVVAGAHWLVMTFWLVAQQSDIIDSTCHWRLF
|
61
|
+
NLLVGAVYILCYLSFWDSPSRNRMVTFYMVMLLENIILLLLATDFLQGASVDQPADHSWG
|
62
|
+
PVWISDWQCLTGNLLQPAASKIHRHLAGLPKEVLWHCRR
|
63
|
+
>tr|D3DSH8|D3DSH8_HUMAN HCG2036819, isoform CRA_a OS=Homo sapiens GN=hCG_2036819 PE=4 SV=1
|
64
|
+
MLGWIQPSRQPQLRAAPPTRTPSAKRCILCNFLPGCWLVGDVAGSRQPSAPQTLRQRQHT
|
65
|
+
RPPPQERGSGRRSPLREARRANPHFKSFPVLEARGLPCGARRTGPRRPVREMTLPSDPER
|
66
|
+
ATLPNPRLGAPAVPRRGPRSHGGRR
|
67
|
+
>tr|D3DX18|D3DX18_HUMAN Putative uncharacterized protein LOC128977 OS=Homo sapiens GN=LOC128977 PE=4 SV=1
|
68
|
+
MADGSGWQPPRPCEAYRAEWKLCRSARHFLHHYYVHGERPACEQWQRDLASCRDWEERRN
|
69
|
+
AEAQQSLCESERARVRAARKHILVWAPRQSPPPDWHLPLPQEKDE
|