mspire 0.8.4 → 0.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -1
- data/VERSION +1 -1
- data/lib/mspire/fasta.rb +5 -27
- data/lib/mspire/ident/peptide/db/creator.rb +248 -0
- data/lib/mspire/ident/peptide/db/io.rb +62 -0
- data/lib/mspire/ident/peptide/db.rb +18 -225
- data/lib/mspire/ident/peptide_hit/qvalue.rb +3 -2
- data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
- data/script/mascot_dat_to_peptide_hit_qvalues.rb +118 -0
- data/spec/mspire/ident/peptide/db/creator_spec.rb +65 -0
- data/spec/mspire/ident/peptide/db/io_spec.rb +21 -0
- data/spec/mspire/ident/peptide/db_spec.rb +7 -97
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml +728 -0
- data/spec/testfiles/mspire/ident/peptide/db/uni_11_sp_tr.msd_clvg2.min_aaseq4.yml +728 -728
- metadata +9 -2
@@ -0,0 +1,118 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'trollop'
|
4
|
+
require 'set'
|
5
|
+
require 'mspire/ident/peptide_hit/qvalue'
|
6
|
+
require 'mspire/error_rate/qvalue'
|
7
|
+
|
8
|
+
# Load the mascot-dat gem, printing installation hints if it is missing.
begin
  require 'mascot/dat'
rescue LoadError
  puts "You need the mascot-dat gem for this to work!"
  puts "AND IT MUST BE THE PRINCELAB GITHUB FORK until changes get incorporated upstream!"
  puts "> gem install mascot-dat"
  # A bare raise re-raises the rescued LoadError, so its original message and
  # backtrace are preserved instead of being replaced by a blank LoadError.
  raise
end
# Only this exact version string is accepted; anything else aborts the script.
raise "need princelab mascot-dat gem!" unless Mascot::DAT::VERSION == "0.3.1.1"
|
17
|
+
|
18
|
+
# Target-decoy bundle: holds the target hits and the decoy hits of a search.
SearchBundle = Struct.new(:target, :decoy) do
  # Combines all bundles under self so that all targets are grouped and all
  # decoys are grouped.
  #
  # bundles - an Enumerable of objects responding to #target and #decoy,
  #           each returning an Array of hits.
  #
  # Whatever self held before is replaced. An empty list of bundles yields
  # empty target and decoy arrays. The input bundles are not modified.
  #
  # Returns self.
  def combine(bundles)
    # The merged lists must be stored on self; the caller reads them back
    # through #target and #decoy.
    self.target = bundles.flat_map { |bundle| bundle.target }
    self.decoy = bundles.flat_map { |bundle| bundle.decoy }
    self
  end
end
|
28
|
+
|
29
|
+
# One hit record (PSM presumably stands for peptide-spectrum match — confirm).
# As filled in by read_mascot_dat_hits: search_id is the base name taken from
# the .dat file's FILE= line, id is the query title, aaseq is the hit's
# peptide string, charge is a signed integer, and score is the hit's score.
PSM = Struct.new(:search_id, :id, :aaseq, :charge, :score)
|
30
|
+
|
31
|
+
# Turns a charge string into a signed integer, e.g. "1+" => 1, "2-" => -2,
# "10+" => 10.
#
# st - a String containing one or more digits immediately followed by
#      '+' or '-'.
#
# Returns the Integer charge (negative when the sign is '-').
# Raises ArgumentError if st contains no digits-then-sign pattern.
def charge_string_to_charge(st)
  # \d+ (not \d) so that multi-digit charge states keep all their digits
  md = st.match(/(\d+)([+-])/)
  raise ArgumentError, "cannot parse charge from #{st.inspect}" unless md

  magnitude = md[1].to_i
  md[2] == '-' ? -magnitude : magnitude
end
|
38
|
+
|
39
|
+
# Reads the target and decoy peptide hits out of a single Mascot .dat file.
#
# dat_file - path to the .dat file; it is expected to contain both target
#            and decoy hits.
#
# The search id given to every PSM is the text between "FILE=" and ".mgf"
# on the first matching line of the file (nil if no such line exists).
# Hits without a query or without a score are skipped.
#
# Returns a SearchBundle whose target holds the PSMs built from
# dat.peptides and whose decoy holds those built from dat.decoy_peptides.
def read_mascot_dat_hits(dat_file)
  filename = nil
  IO.foreach(dat_file) do |line|
    # the dot is escaped so that only a literal ".mgf" extension matches
    if line =~ /^FILE=(.*?)\.mgf/i
      filename = $1.dup
      break
    end
  end

  dat = Mascot::DAT.open(dat_file)
  begin
    data = [:peptides, :decoy_peptides].map do |mthd|
      psms = []
      dat.send(mthd).each do |psm|
        next unless psm.query
        query = dat.query(psm.query)
        charge = charge_string_to_charge(query.charge)
        psms << PSM.new(filename, query.title, psm.pep, charge, psm.score) if psm.score
      end
      psms
    end
    SearchBundle.new(*data)
  ensure
    # release the dat handle even if reading or charge parsing raised
    dat.close
  end
end
|
62
|
+
|
63
|
+
|
64
|
+
# Verbose-only puts: prints the given lines to standard output when the
# global $VERBOSE flag is truthy. Standard output is flushed on every call,
# whether or not anything was printed.
def putsv(*lines)
  if $VERBOSE
    $stdout.puts(*lines)
  end
  $stdout.flush
end
|
68
|
+
|
69
|
+
# Extension appended to every output file name.
EXT = Mspire::Ident::PeptideHit::Qvalue::FILE_EXTENSION
# Base name of the single output file written when --combine is given.
combine_base = "combined"

opts = Trollop::Parser.new do
  banner %Q{usage: #{File.basename(__FILE__)} <mascot>.dat ...
outputs: <mascot>.phq.tsv
assumes a decoy search was run *with* the initial search
phq.tsv?: see schema/peptide_hit_qvalues.phq.tsv
}
  opt :combine, "groups target and decoy hits together from all files, writing to #{combine_base}#{EXT}", :default => false
  opt :z_together, "do not group by charge state", :default => false
  opt :verbose, "be verbose", :default => false
end

# Parser#parse signals --help, --version and bad options by raising; the
# standard handler turns those into the usual usage/error output and exit
# instead of an uncaught exception with a backtrace.
opt = Trollop.with_standard_exception_handling(opts) { opts.parse(ARGV) }
if ARGV.size == 0
  opts.educate
  exit
end

# putsv consults $VERBOSE, so the flag is moved from the options hash into it
$VERBOSE = opt.delete(:verbose)

files = ARGV.to_a

bundles = files.map do |file|
  # assumes the file has both target and decoy hits
  read_mascot_dat_hits(file)
end

# maps each output file name to the bundle of hits it will be computed from
to_run = {}
if opt[:combine]
  putsv "combining all target hits together and all decoy hits together"
  bundle = SearchBundle.new.combine(bundles)
  to_run[combine_base + EXT] = bundle
else
  files.zip(bundles) do |file, bundle|
    # swap the input file's extension for EXT
    to_run[file.chomp(File.extname(file)) + EXT] = bundle
  end
end

to_run.each do |file, bundle|
  putsv "calculating qvalues for #{file}"
  hit_qvalue_pairs = Mspire::ErrorRate::Qvalue.target_decoy_qvalues(bundle.target, bundle.decoy, :z_together => opt[:z_together])
  # the pairs are split into a list of hits and a parallel list of q-values
  outfile = Mspire::Ident::PeptideHit::Qvalue.to_file(file, *hit_qvalue_pairs.transpose)
  putsv "created: #{outfile}"
end
|
117
|
+
|
118
|
+
|
@@ -0,0 +1,65 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
path = 'mspire/ident/peptide/db'
|
5
|
+
require path + "/creator"
|
6
|
+
|
7
|
+
# Specs for Mspire::Ident::Peptide::Db::Creator: wildcard amino acid
# expansion and the command-line entry point.
describe 'creating a peptide centric database' do
  subject { Mspire::Ident::Peptide::Db::Creator.new }

  describe 'amino acid expansion' do

    it 'can expand out wildcard amino acid combinations' do
      array = subject.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
      array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
    end

    it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
      # this is from real data
      worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
      subject.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.should == true
    end

    it 'returns the peptide in the array if no expansion' do
      array = subject.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
      array.should == ['ZZZZZ']
    end
  end

  describe 'the commandline utility' do

    before do
      @fasta_file = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
      @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
    end

    it 'converts a fasta file into peptide centric db' do
      output_files = Mspire::Ident::Peptide::Db::Creator.cmdline([@fasta_file])
      output_files.first.should == File.expand_path(@output_file)
      File.exist?(@output_file).should == true
      hash = {}
      YAML.load_file(@output_file).each do |k,v|
        hash[k] = v.split("\t")
      end
      sorted = hash.sort
      # these are merely frozen, not perfectly defined
      sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["P62258"]]
      sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["D2KTA8"]]
      sorted.size.should == 728
      # NOTE(review): cleanup is disabled, so the generated file stays on
      # disk; a file with this name also ships under spec/testfiles —
      # confirm that leaving it in place is intentional.
      #File.unlink(@output_file)
    end

    it 'lists approved enzymes and exits' do
      # record the exit so the example fails if cmdline returns normally
      exited = false
      output = capture_stdout do
        begin
          Mspire::Ident::Peptide::Db::Creator.cmdline(['--list-enzymes'])
        rescue SystemExit
          exited = true
        end
      end
      exited.should == true
      lines = output.split("\n")
      lines.include?("trypsin").should == true
      lines.include?("chymotrypsin").should == true
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
require 'mspire/ident/peptide/db/io'
|
4
|
+
|
5
|
+
# Specs for Mspire::Ident::Peptide::Db::IO: keyed lookup and enumeration
# over a peptide centric YAML file on disk.
describe 'reading a peptide centric DB' do
  before do
    @pepcentric = [TESTFILES, "mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml"].join('/')
  end

  it 'reads the file on disk with random access or is enumerable' do
    Mspire::Ident::Peptide::Db::IO.open(@pepcentric) do |io|
      # keyed lookup: a known peptide gives its protein ids, an unknown one nil
      io["AVTEQGHELSNEER"].should == ["sp|P31946|1433B_HUMAN", "sp|P31946-2|1433B_HUMAN"]
      io["VRAAR"].should == %w(tr|D3DX18|D3DX18_HUMAN)
      io["SILLY WILLY"].should be_nil

      # enumeration: each entry starts with a String and ends with an Array
      io.each_with_index do |entry, _index|
        entry.first.should be_an_instance_of String
        entry.last.should be_a_kind_of Array
      end
    end
  end
end
|
@@ -4,105 +4,15 @@ require 'yaml'
|
|
4
4
|
path = 'mspire/ident/peptide/db'
|
5
5
|
require path
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
def capture_stdout
|
10
|
-
out = StringIO.new
|
11
|
-
$stdout = out
|
12
|
-
yield
|
13
|
-
out.rewind
|
14
|
-
return out.read
|
15
|
-
ensure
|
16
|
-
$stdout = STDOUT
|
17
|
-
end
|
18
|
-
|
19
|
-
end
|
20
|
-
|
21
|
-
|
22
|
-
describe 'a uniprot fasta file' do
|
23
|
-
|
7
|
+
describe 'reading a peptide centric db' do
|
24
8
|
before do
|
25
|
-
@
|
26
|
-
end
|
27
|
-
|
28
|
-
describe 'amino acid expansion' do
|
29
|
-
|
30
|
-
it 'can expand out wildcard amino acid combinations' do
|
31
|
-
array = Mspire::Ident::Peptide::Db.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
|
32
|
-
array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
|
33
|
-
end
|
34
|
-
|
35
|
-
it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
|
36
|
-
# this is from real data
|
37
|
-
worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
|
38
|
-
Mspire::Ident::Peptide::Db.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.should == true
|
39
|
-
end
|
40
|
-
|
41
|
-
it 'returns the peptide in the array if no expansion' do
|
42
|
-
array = Mspire::Ident::Peptide::Db.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
|
43
|
-
array.should == ['ZZZZZ']
|
44
|
-
end
|
45
|
-
|
9
|
+
@pepcentric = TESTFILES + "/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml"
|
46
10
|
end
|
47
11
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
end
|
54
|
-
|
55
|
-
it 'converts a fasta file into peptide centric db' do
|
56
|
-
output_files = Mspire::Ident::Peptide::Db.cmdline([@fasta_file])
|
57
|
-
output_files.first.should == File.expand_path(@output_file)
|
58
|
-
File.exist?(@output_file).should == true
|
59
|
-
hash = {}
|
60
|
-
YAML.load_file(@output_file).each do |k,v|
|
61
|
-
hash[k] = v.split("\t")
|
62
|
-
end
|
63
|
-
sorted = hash.sort
|
64
|
-
# these are merely frozen, not perfectly defined
|
65
|
-
sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["sp|P62258|1433E_HUMAN"]]
|
66
|
-
sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["tr|D2KTA8|D2KTA8_HUMAN"]]
|
67
|
-
sorted.size.should == 728
|
68
|
-
File.unlink(@output_file)
|
69
|
-
end
|
70
|
-
|
71
|
-
it 'lists approved enzymes and exits' do
|
72
|
-
output = capture_stdout do
|
73
|
-
begin
|
74
|
-
Mspire::Ident::Peptide::Db.cmdline(['--list-enzymes'])
|
75
|
-
rescue SystemExit
|
76
|
-
1.should == 1 # we exited
|
77
|
-
end
|
78
|
-
end
|
79
|
-
lines = output.split("\n")
|
80
|
-
lines.include?("trypsin").should == true
|
81
|
-
lines.include?("chymotrypsin").should == true
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
describe 'reading a peptide centric database' do
|
86
|
-
before do
|
87
|
-
outfiles = Mspire::Ident::Peptide::Db.cmdline([@fasta_file])
|
88
|
-
@outfile = outfiles.first
|
89
|
-
end
|
90
|
-
|
91
|
-
it 'creates a hash that can retrieve peptides as an array' do
|
92
|
-
hash = Mspire::Ident::Peptide::Db.new(@outfile)
|
93
|
-
hash["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
|
94
|
-
hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
|
95
|
-
end
|
96
|
-
|
97
|
-
it 'reads the file on disk with random access or is enumerable' do
|
98
|
-
Mspire::Ident::Peptide::Db::IO.open(@outfile) do |io|
|
99
|
-
io["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
|
100
|
-
io["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
|
101
|
-
io.each_with_index do |key_prots, i|
|
102
|
-
key_prots.first.should be_an_instance_of String
|
103
|
-
key_prots.last.should be_a_kind_of Array
|
104
|
-
end
|
105
|
-
end
|
106
|
-
end
|
12
|
+
it 'creates a hash that can retrieve peptides as an array' do
|
13
|
+
hash = Mspire::Ident::Peptide::Db.new(@pepcentric)
|
14
|
+
hash["AVTEQGHELSNEER"].should == ["sp|P31946|1433B_HUMAN", "sp|P31946-2|1433B_HUMAN"]
|
15
|
+
hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
|
16
|
+
hash["BANNANA"].should == nil
|
107
17
|
end
|
108
18
|
end
|