mspire 0.8.4 → 0.8.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +1 -1
- data/VERSION +1 -1
- data/lib/mspire/fasta.rb +5 -27
- data/lib/mspire/ident/peptide/db/creator.rb +248 -0
- data/lib/mspire/ident/peptide/db/io.rb +62 -0
- data/lib/mspire/ident/peptide/db.rb +18 -225
- data/lib/mspire/ident/peptide_hit/qvalue.rb +3 -2
- data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
- data/script/mascot_dat_to_peptide_hit_qvalues.rb +118 -0
- data/spec/mspire/ident/peptide/db/creator_spec.rb +65 -0
- data/spec/mspire/ident/peptide/db/io_spec.rb +21 -0
- data/spec/mspire/ident/peptide/db_spec.rb +7 -97
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml +728 -0
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -728
- metadata +9 -2
@@ -0,0 +1,118 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'trollop'
|
4
|
+
require 'set'
|
5
|
+
require 'mspire/ident/peptide_hit/qvalue'
|
6
|
+
require 'mspire/error_rate/qvalue'
|
7
|
+
|
8
|
+
begin
|
9
|
+
require 'mascot/dat'
|
10
|
+
rescue LoadError
|
11
|
+
puts "You need the mascot-dat gem for this to work!"
|
12
|
+
puts "AND IT MUST BE THE PRINCELAB GITHUB FORK until changes get incorporated upstream!"
|
13
|
+
puts "> gem install mascot-dat"
|
14
|
+
raise LoadError
|
15
|
+
end
|
16
|
+
raise "need princelab mascot-dat gem!" unless Mascot::DAT::VERSION == "0.3.1.1"
|
17
|
+
|
18
|
+
# Target-decoy bundle: pairs the target hits of a search with its decoy hits.
SearchBundle = Struct.new(:target, :decoy) do
  # Pools the hits of several bundles into self, so that all targets are
  # grouped together and all decoys are grouped together.
  #
  # bundles - an Enumerable of objects responding to #target and #decoy, each
  #           returning an Array of hits.
  #
  # Whatever self held in target/decoy beforehand is replaced. An empty
  # +bundles+ leaves self with two empty arrays. Returns self.
  def combine(bundles)
    # The pooled arrays must be stored on self: previously they were only
    # assigned to locals, so the returned bundle still had nil target/decoy.
    self.target = bundles.flat_map(&:target)
    self.decoy = bundles.flat_map(&:decoy)
    self
  end
end
|
28
|
+
|
29
|
+
# A single peptide-spectrum match. read_mascot_dat_hits fills the fields, in
# order, with: the name taken from the dat file's FILE= line, the query
# title, the peptide sequence (psm.pep), the integer charge and the score.
PSM = Struct.new(:search_id, :id, :aaseq, :charge, :score)
|
30
|
+
|
31
|
+
# Converts a charge string into a signed Integer: "1+" => 1, "2-" => -2,
# "10+" => 10.
#
# st - a String holding one or more digits followed by '+' or '-'. Only the
#      first such occurrence in the string is used.
#
# Returns the Integer charge (negative when the sign is '-').
# Raises ArgumentError when no charge can be found in +st+.
def charge_string_to_charge(st)
  # \d+ (not \d) so multi-digit charges are not truncated to their last digit
  md = st.to_s.match(/(\d+)([+-])/)
  raise ArgumentError, "cannot parse charge from #{st.inspect}" unless md

  magnitude = md[1].to_i
  md[2] == '-' ? -magnitude : magnitude
end
|
38
|
+
|
39
|
+
# Reads the target and decoy peptide hits out of a Mascot .dat file.
#
# dat_file - path to a Mascot .dat file that was searched together with a
#            decoy database (it is queried for both peptides and
#            decoy_peptides).
#
# Each hit that has a query and a score becomes a PSM whose search_id is the
# name found on the file's "FILE=<name>.mgf" line (nil if there is no such
# line).
#
# Returns a SearchBundle: target holds the PSMs built from dat.peptides,
# decoy those built from dat.decoy_peptides.
def read_mascot_dat_hits(dat_file)
  filename = nil
  File.foreach(dat_file) do |line|
    # the dot is escaped so only a literal ".mgf" extension ends the name
    if line =~ /^FILE=(.*?)\.mgf/i
      filename = $1.dup
      break
    end
  end

  dat = Mascot::DAT.open(dat_file)
  begin
    data = [:peptides, :decoy_peptides].map do |mthd|
      psms = []
      dat.send(mthd).each do |psm|
        next unless psm.query
        query = dat.query(psm.query)
        charge = charge_string_to_charge(query.charge)
        psms << PSM.new(filename, query.title, psm.pep, charge, psm.score) if psm.score
      end
      psms
    end
  ensure
    # release the dat handle even if reading a hit raises
    dat.close
  end
  SearchBundle.new(*data)
end
|
62
|
+
|
63
|
+
|
64
|
+
# Verbose-only puts: the arguments are printed just when $VERBOSE is set.
# Standard output is flushed on every call, printed or not.
def putsv(*args)
  if $VERBOSE
    puts(*args)
  end
  $stdout.flush
end
|
68
|
+
|
69
|
+
# ---------------------------------------------------------------------------
# Command-line driver: reads each Mascot .dat file given on the command line,
# computes target-decoy q-values and writes one output file per input (or a
# single combined file with --combine).
# ---------------------------------------------------------------------------

EXT = Mspire::Ident::PeptideHit::Qvalue::FILE_EXTENSION
combine_base = "combined"

opts = Trollop::Parser.new do
  banner %Q{usage: #{File.basename(__FILE__)} <mascot>.dat ...
outputs: <mascot>.phq.tsv
assumes a decoy search was run *with* the initial search
phq.tsv?: see schema/peptide_hit_qvalues.phq.tsv
}
  opt :combine, "groups target and decoy hits together from all files, writing to #{combine_base}#{EXT}", :default => false
  opt :z_together, "do not group by charge state", :default => false
  opt :verbose, "be verbose", :default => false
end

# Parse inside Trollop's standard handler so that --help, --version and
# malformed options are reported by Trollop rather than escaping as uncaught
# exceptions. parse strips the options out of ARGV, so an empty ARGV
# afterwards means no input files were given: show the usage and exit.
opt = Trollop::with_standard_exception_handling(opts) do
  parsed = opts.parse(ARGV)
  raise Trollop::HelpNeeded if ARGV.empty?
  parsed
end

# putsv consults the global $VERBOSE to decide whether to print
$VERBOSE = opt.delete(:verbose)

files = ARGV.to_a

bundles = files.map do |file|
  # assumes the file has both target and decoy hits
  read_mascot_dat_hits(file)
end

# maps each output filename to the bundle whose q-values it will hold
to_run = {}
if opt[:combine]
  putsv "combining all target hits together and all decoy hits together"
  bundle = SearchBundle.new.combine(bundles)
  to_run[combine_base + EXT] = bundle
else
  files.zip(bundles) do |file, bundle|
    # swap the input file's extension for the q-value extension
    to_run[file.chomp(File.extname(file)) + EXT] = bundle
  end
end

to_run.each do |file, bundle|
  putsv "calculating qvalues for #{file}"
  hit_qvalue_pairs = Mspire::ErrorRate::Qvalue.target_decoy_qvalues(bundle.target, bundle.decoy, :z_together => opt[:z_together])
  # transpose splits the [hit, qvalue] pairs into a hits list and a qvalues list
  outfile = Mspire::Ident::PeptideHit::Qvalue.to_file(file, *hit_qvalue_pairs.transpose)
  putsv "created: #{outfile}"
end
|
117
|
+
|
118
|
+
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
path = 'mspire/ident/peptide/db'
|
5
|
+
require path + "/creator"
|
6
|
+
|
7
|
+
# Specs for Mspire::Ident::Peptide::Db::Creator: wildcard amino acid
# expansion and the command-line entry point.
describe 'creating a peptide centric database' do
  subject { Mspire::Ident::Peptide::Db::Creator.new }

  describe 'amino acid expansion' do

    it 'can expand out wildcard amino acid combinations' do
      array = subject.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
      array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
    end

    it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
      # this is from real data
      worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
      subject.expand_peptides(worst_case, 'X' => %w(* % &)).should be_nil
    end

    it 'returns the peptide in the array if no expansion' do
      array = subject.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
      array.should == ['ZZZZZ']
    end
  end

  describe 'the commandline utility' do

    before do
      @fasta_file = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
      @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
    end

    it 'converts a fasta file into peptide centric db' do
      output_files = Mspire::Ident::Peptide::Db::Creator.cmdline([@fasta_file])
      output_files.first.should == File.expand_path(@output_file)
      File.exist?(@output_file).should == true
      hash = {}
      YAML.load_file(@output_file).each do |k,v|
        hash[k] = v.split("\t")
      end
      sorted = hash.sort
      # these are merely frozen, not perfectly defined
      sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["P62258"]]
      sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["D2KTA8"]]
      sorted.size.should == 728
      #File.unlink(@output_file)
    end

    it 'lists approved enzymes and exits' do
      # Record the exit explicitly: an assertion placed inside the rescue
      # clause never runs (and so never fails) when no SystemExit is raised.
      exited = false
      output = capture_stdout do
        begin
          Mspire::Ident::Peptide::Db::Creator.cmdline(['--list-enzymes'])
        rescue SystemExit
          exited = true
        end
      end
      exited.should == true
      lines = output.split("\n")
      lines.include?("trypsin").should == true
      lines.include?("chymotrypsin").should == true
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'mspire/ident/peptide/db/io'
|
4
|
+
|
5
|
+
# Specs for Mspire::Ident::Peptide::Db::IO: keyed lookup and enumeration of
# a peptide centric DB file on disk.
describe 'reading a peptide centric DB' do
  before do
    @pepcentric = TESTFILES + "/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml"
  end

  it 'reads the file on disk with random access or is enumerable' do
    Mspire::Ident::Peptide::Db::IO.open(@pepcentric) do |io|
      # keyed lookup: known peptides give their protein ids, unknown ones nil
      shared_peptide_proteins = io["AVTEQGHELSNEER"]
      shared_peptide_proteins.should == ["sp|P31946|1433B_HUMAN", "sp|P31946-2|1433B_HUMAN"]

      single_protein = io["VRAAR"]
      single_protein.should == ["tr|D3DX18|D3DX18_HUMAN"]

      missing = io["SILLY WILLY"]
      missing.should be_nil

      # enumeration: every entry starts with a String and ends with an Array
      io.each_with_index do |entry, _index|
        entry.first.should be_an_instance_of String
        entry.last.should be_a_kind_of Array
      end
    end
  end
end
|
@@ -4,105 +4,15 @@ require 'yaml'
|
|
4
4
|
path = 'mspire/ident/peptide/db'
|
5
5
|
require path
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
def capture_stdout
|
10
|
-
out = StringIO.new
|
11
|
-
$stdout = out
|
12
|
-
yield
|
13
|
-
out.rewind
|
14
|
-
return out.read
|
15
|
-
ensure
|
16
|
-
$stdout = STDOUT
|
17
|
-
end
|
18
|
-
|
19
|
-
end
|
20
|
-
|
21
|
-
|
22
|
-
describe 'a uniprot fasta file' do
|
23
|
-
|
7
|
+
describe 'reading a peptide centric db' do
|
24
8
|
before do
|
25
|
-
@
|
26
|
-
end
|
27
|
-
|
28
|
-
describe 'amino acid expansion' do
|
29
|
-
|
30
|
-
it 'can expand out wildcard amino acid combinations' do
|
31
|
-
array = Mspire::Ident::Peptide::Db.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
|
32
|
-
array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
|
33
|
-
end
|
34
|
-
|
35
|
-
it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
|
36
|
-
# this is from real data
|
37
|
-
worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
|
38
|
-
Mspire::Ident::Peptide::Db.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.should == true
|
39
|
-
end
|
40
|
-
|
41
|
-
it 'returns the peptide in the array if no expansion' do
|
42
|
-
array = Mspire::Ident::Peptide::Db.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
|
43
|
-
array.should == ['ZZZZZ']
|
44
|
-
end
|
45
|
-
|
9
|
+
@pepcentric = TESTFILES + "/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml"
|
46
10
|
end
|
47
11
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
|
-
it 'converts a fasta file into peptide centric db' do
|
56
|
-
output_files = Mspire::Ident::Peptide::Db.cmdline([@fasta_file])
|
57
|
-
output_files.first.should == File.expand_path(@output_file)
|
58
|
-
File.exist?(@output_file).should == true
|
59
|
-
hash = {}
|
60
|
-
YAML.load_file(@output_file).each do |k,v|
|
61
|
-
hash[k] = v.split("\t")
|
62
|
-
end
|
63
|
-
sorted = hash.sort
|
64
|
-
# these are merely frozen, not perfectly defined
|
65
|
-
sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["sp|P62258|1433E_HUMAN"]]
|
66
|
-
sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["tr|D2KTA8|D2KTA8_HUMAN"]]
|
67
|
-
sorted.size.should == 728
|
68
|
-
File.unlink(@output_file)
|
69
|
-
end
|
70
|
-
|
71
|
-
it 'lists approved enzymes and exits' do
|
72
|
-
output = capture_stdout do
|
73
|
-
begin
|
74
|
-
Mspire::Ident::Peptide::Db.cmdline(['--list-enzymes'])
|
75
|
-
rescue SystemExit
|
76
|
-
1.should == 1 # we exited
|
77
|
-
end
|
78
|
-
end
|
79
|
-
lines = output.split("\n")
|
80
|
-
lines.include?("trypsin").should == true
|
81
|
-
lines.include?("chymotrypsin").should == true
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
describe 'reading a peptide centric database' do
|
86
|
-
before do
|
87
|
-
outfiles = Mspire::Ident::Peptide::Db.cmdline([@fasta_file])
|
88
|
-
@outfile = outfiles.first
|
89
|
-
end
|
90
|
-
|
91
|
-
it 'creates a hash that can retrieve peptides as an array' do
|
92
|
-
hash = Mspire::Ident::Peptide::Db.new(@outfile)
|
93
|
-
hash["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
|
94
|
-
hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
|
95
|
-
end
|
96
|
-
|
97
|
-
it 'reads the file on disk with random access or is enumerable' do
|
98
|
-
Mspire::Ident::Peptide::Db::IO.open(@outfile) do |io|
|
99
|
-
io["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
|
100
|
-
io["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
|
101
|
-
io.each_with_index do |key_prots, i|
|
102
|
-
key_prots.first.should be_an_instance_of String
|
103
|
-
key_prots.last.should be_a_kind_of Array
|
104
|
-
end
|
105
|
-
end
|
106
|
-
end
|
12
|
+
it 'creates a hash that can retrieve peptides as an array' do
|
13
|
+
hash = Mspire::Ident::Peptide::Db.new(@pepcentric)
|
14
|
+
hash["AVTEQGHELSNEER"].should == ["sp|P31946|1433B_HUMAN", "sp|P31946-2|1433B_HUMAN"]
|
15
|
+
hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
|
16
|
+
hash["BANNANA"].should == nil
|
107
17
|
end
|
108
18
|
end
|