mspire 0.8.4 → 0.8.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'trollop'
4
+ require 'set'
5
+ require 'mspire/ident/peptide_hit/qvalue'
6
+ require 'mspire/error_rate/qvalue'
7
+
8
+ begin
9
+ require 'mascot/dat'
10
+ rescue LoadError
11
+ puts "You need the mascot-dat gem for this to work!"
12
+ puts "AND IT MUST BE THE PRINCELAB GITHUB FORK until changes get incorporated upstream!"
13
+ puts "> gem install mascot-dat"
14
+ raise LoadError
15
+ end
16
+ raise "need princelab mascot-dat gem!" unless Mascot::DAT::VERSION == "0.3.1.1"
17
+
18
+ # target-decoy bundle
19
# Pairs the target hits with the decoy hits of one search (or, after
# #combine, of several searches pooled together).
SearchBundle = Struct.new(:target, :decoy) do
  # Pools the hits of every bundle in +bundles+ into self: all target arrays
  # are concatenated (in order) into self.target and all decoy arrays into
  # self.decoy. Whatever self held before is replaced. An empty +bundles+
  # leaves both members as empty arrays.
  #
  # bundles - Enumerable of objects responding to #target and #decoy, each
  #           returning an Array of hits
  #
  # Returns self.
  def combine(bundles)
    # The pooled arrays have to be stored on self; computing them into plain
    # locals (as before) returned a bundle whose target and decoy were still nil.
    self.target = bundles.flat_map(&:target)
    self.decoy = bundles.flat_map(&:decoy)
    self
  end
end
28
+
29
# A single peptide-spectrum match. Members are positional, in this order.
PSM = Struct.new(
  :search_id,
  :id,
  :aaseq,
  :charge,
  :score
)
30
+
31
+ # turns 1+ into 1
32
# Converts a charge string such as "2+" or "1-" into a signed Integer.
#
# st - String containing one or more digits immediately followed by '+' or
#      '-'; only the first such occurrence is used
#
# Returns the Integer charge, negative when the sign is '-'.
# Raises ArgumentError if +st+ holds no digits-then-sign sequence.
def charge_string_to_charge(st)
  # \d+ (not \d) so that multi-digit charges like "12+" are not truncated.
  md = st.to_s.match(/(\d+)([+-])/)
  raise ArgumentError, "cannot parse a charge from #{st.inspect}" unless md
  magnitude = md[1].to_i
  md[2] == '-' ? -magnitude : magnitude
end
38
+
39
# Reads the target and decoy peptide hits out of one Mascot .dat file.
#
# The search id given to every PSM is taken from the first line of the file
# that looks like "FILE=<name>.mgf" (case-insensitive); it is nil when no
# such line exists.
#
# dat_file - path of the .dat file
#
# Returns a SearchBundle whose target holds the PSMs built from
# dat.peptides and whose decoy holds those built from dat.decoy_peptides.
# Hits without a query are skipped; hits without a score are not recorded.
def read_mascot_dat_hits(dat_file)
  filename = nil
  IO.foreach(dat_file) do |line|
    # the dot is escaped so that only a literal ".mgf" extension matches
    if (md = line.match(/^FILE=(.*?)\.mgf/i))
      filename = md[1]
      break
    end
  end

  dat = Mascot::DAT.open(dat_file)
  begin
    data = [:peptides, :decoy_peptides].map do |mthd|
      psms = []
      dat.send(mthd).each do |psm|
        next unless psm.query
        query = dat.query(psm.query)
        charge = charge_string_to_charge(query.charge)
        psms << PSM.new(filename, query.title, psm.pep, charge, psm.score) if psm.score
      end
      psms
    end
    SearchBundle.new(*data)
  ensure
    # close the handle even when reading or parsing a hit raises
    dat.close
  end
end
62
+
63
+
64
# Verbose-only puts: writes +args+ to $stdout when $VERBOSE is truthy.
# $stdout is flushed on every call, whether or not anything was written.
#
# Returns the result of flushing $stdout.
def putsv(*args)
  out = $stdout
  out.puts(*args) if $VERBOSE
  out.flush
end
68
+
69
# ---------------------------------------------------------------------------
# Command-line driver: reads each Mascot .dat file named on the command line,
# computes target-decoy q-values for its hits and writes them to a file with
# the EXT extension (one per input, or a single combined file with --combine).
# ---------------------------------------------------------------------------

EXT = Mspire::Ident::PeptideHit::Qvalue::FILE_EXTENSION
combine_base = "combined"

opts = Trollop::Parser.new do
  banner %Q{usage: #{File.basename(__FILE__)} <mascot>.dat ...
outputs: <mascot>.phq.tsv
assumes a decoy search was run *with* the initial search
phq.tsv?: see schema/peptide_hit_qvalues.phq.tsv
}
  opt :combine, "groups target and decoy hits together from all files, writing to #{combine_base}#{EXT}", :default => false
  opt :z_together, "do not group by charge state", :default => false
  opt :verbose, "be verbose", :default => false
end

# Parser#parse raises on --help and on malformed options; the standard handler
# turns those into the usual help text / error message plus exit instead of
# an uncaught exception.
opt = Trollop::with_standard_exception_handling(opts) do
  opts.parse(ARGV)
end

# no input files given: show usage and stop
if ARGV.size == 0
  opts.educate
  exit
end

# putsv consults $VERBOSE
$VERBOSE = opt.delete(:verbose)

files = ARGV.to_a

# each file is assumed to hold both target and decoy hits
bundles = files.map do |file|
  read_mascot_dat_hits(file)
end

# output filename => bundle of hits to compute q-values for
to_run = {}
if opt[:combine]
  putsv "combining all target hits together and all decoy hits together"
  bundle = SearchBundle.new.combine(bundles)
  to_run[combine_base + EXT] = bundle
else
  files.zip(bundles) do |file, bundle|
    # swap the input file's extension for EXT
    to_run[file.chomp(File.extname(file)) + EXT] = bundle
  end
end

to_run.each do |file, bundle|
  putsv "calculating qvalues for #{file}"
  hit_qvalue_pairs = Mspire::ErrorRate::Qvalue.target_decoy_qvalues(bundle.target, bundle.decoy, :z_together => opt[:z_together])
  # hit_qvalue_pairs is transposed into two parallel lists and splatted into to_file
  outfile = Mspire::Ident::PeptideHit::Qvalue.to_file(file, *hit_qvalue_pairs.transpose)
  putsv "created: #{outfile}"
end
117
+
118
+
@@ -0,0 +1,65 @@
1
+ require 'spec_helper'
2
+
3
+ require 'yaml'
4
+ path = 'mspire/ident/peptide/db'
5
+ require path + "/creator"
6
+
7
# Specs for Mspire::Ident::Peptide::Db::Creator: wildcard amino-acid
# expansion and the fasta -> peptide-centric-db command-line entry point.
describe 'creating a peptide centric database' do
  subject { Mspire::Ident::Peptide::Db::Creator.new }

  describe 'amino acid expansion' do

    it 'can expand out wildcard amino acid combinations' do
      array = subject.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
      array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
    end

    it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
      # this is from real data
      worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
      subject.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.should == true
    end

    it 'returns the peptide in the array if no expansion' do
      array = subject.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
      array.should == ['ZZZZZ']
    end
  end

  describe 'the commandline utility' do

    before do
      @fasta_file = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
      @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
    end

    it 'converts a fasta file into peptide centric db' do
      output_files = Mspire::Ident::Peptide::Db::Creator.cmdline([@fasta_file])
      output_files.first.should == File.expand_path(@output_file)
      File.exist?(@output_file).should == true
      hash = {}
      YAML.load_file(@output_file).each do |k,v|
        hash[k] = v.split("\t")
      end
      sorted = hash.sort
      # these are merely frozen, not perfectly defined
      sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["P62258"]]
      sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["D2KTA8"]]
      sorted.size.should == 728
      #File.unlink(@output_file)
    end

    it 'lists approved enzymes and exits' do
      output = capture_stdout do
        # Assert the exit itself: an expectation placed only inside
        # `rescue SystemExit` is silently skipped when no exit happens.
        lambda {
          Mspire::Ident::Peptide::Db::Creator.cmdline(['--list-enzymes'])
        }.should raise_error(SystemExit)
      end
      lines = output.split("\n")
      lines.include?("trypsin").should == true
      lines.include?("chymotrypsin").should == true
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'spec_helper'
2
+
3
+ require 'mspire/ident/peptide/db/io'
4
+
5
# Spec for Mspire::Ident::Peptide::Db::IO: keyed lookup and enumeration over
# a peptide-centric yml file kept with the test files.
describe 'reading a peptide centric DB' do
  let(:pepcentric) do
    TESTFILES + "/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml"
  end

  it 'reads the file on disk with random access or is enumerable' do
    Mspire::Ident::Peptide::Db::IO.open(pepcentric) do |db|
      # keyed lookups: known peptides give their protein ids, unknown ones nil
      db["AVTEQGHELSNEER"].should == ["sp|P31946|1433B_HUMAN", "sp|P31946-2|1433B_HUMAN"]
      db["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
      db["SILLY WILLY"].should be_nil

      # enumeration: every entry is a [String, Array] pair
      db.each_with_index do |entry, _index|
        entry.first.should be_an_instance_of String
        entry.last.should be_a_kind_of Array
      end
    end
  end
end
@@ -4,105 +4,15 @@ require 'yaml'
4
4
  path = 'mspire/ident/peptide/db'
5
5
  require path
6
6
 
7
- module Kernel
8
-
9
- def capture_stdout
10
- out = StringIO.new
11
- $stdout = out
12
- yield
13
- out.rewind
14
- return out.read
15
- ensure
16
- $stdout = STDOUT
17
- end
18
-
19
- end
20
-
21
-
22
- describe 'a uniprot fasta file' do
23
-
7
+ describe 'reading a peptide centric db' do
24
8
  before do
25
- @fasta_file = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
26
- end
27
-
28
- describe 'amino acid expansion' do
29
-
30
- it 'can expand out wildcard amino acid combinations' do
31
- array = Mspire::Ident::Peptide::Db.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
32
- array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
33
- end
34
-
35
- it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
36
- # this is from real data
37
- worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
38
- Mspire::Ident::Peptide::Db.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.should == true
39
- end
40
-
41
- it 'returns the peptide in the array if no expansion' do
42
- array = Mspire::Ident::Peptide::Db.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
43
- array.should == ['ZZZZZ']
44
- end
45
-
9
+ @pepcentric = TESTFILES + "/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml"
46
10
  end
47
11
 
48
- describe 'creating a peptide centric database' do
49
- before do
50
-
51
- #@output_file = [TESTFILES, path, 'uni_11_sp_tr.'].join('/')
52
- @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
53
- end
54
-
55
- it 'converts a fasta file into peptide centric db' do
56
- output_files = Mspire::Ident::Peptide::Db.cmdline([@fasta_file])
57
- output_files.first.should == File.expand_path(@output_file)
58
- File.exist?(@output_file).should == true
59
- hash = {}
60
- YAML.load_file(@output_file).each do |k,v|
61
- hash[k] = v.split("\t")
62
- end
63
- sorted = hash.sort
64
- # these are merely frozen, not perfectly defined
65
- sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["sp|P62258|1433E_HUMAN"]]
66
- sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["tr|D2KTA8|D2KTA8_HUMAN"]]
67
- sorted.size.should == 728
68
- File.unlink(@output_file)
69
- end
70
-
71
- it 'lists approved enzymes and exits' do
72
- output = capture_stdout do
73
- begin
74
- Mspire::Ident::Peptide::Db.cmdline(['--list-enzymes'])
75
- rescue SystemExit
76
- 1.should == 1 # we exited
77
- end
78
- end
79
- lines = output.split("\n")
80
- lines.include?("trypsin").should == true
81
- lines.include?("chymotrypsin").should == true
82
- end
83
- end
84
-
85
- describe 'reading a peptide centric database' do
86
- before do
87
- outfiles = Mspire::Ident::Peptide::Db.cmdline([@fasta_file])
88
- @outfile = outfiles.first
89
- end
90
-
91
- it 'creates a hash that can retrieve peptides as an array' do
92
- hash = Mspire::Ident::Peptide::Db.new(@outfile)
93
- hash["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
94
- hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
95
- end
96
-
97
- it 'reads the file on disk with random access or is enumerable' do
98
- Mspire::Ident::Peptide::Db::IO.open(@outfile) do |io|
99
- io["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
100
- io["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
101
- io.each_with_index do |key_prots, i|
102
- key_prots.first.should be_an_instance_of String
103
- key_prots.last.should be_a_kind_of Array
104
- end
105
- end
106
- end
12
+ it 'creates a hash that can retrieve peptides as an array' do
13
+ hash = Mspire::Ident::Peptide::Db.new(@pepcentric)
14
+ hash["AVTEQGHELSNEER"].should == ["sp|P31946|1433B_HUMAN", "sp|P31946-2|1433B_HUMAN"]
15
+ hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
16
+ hash["BANNANA"].should == nil
107
17
  end
108
18
  end