mspire 0.8.4 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'trollop'
4
+ require 'set'
5
+ require 'mspire/ident/peptide_hit/qvalue'
6
+ require 'mspire/error_rate/qvalue'
7
+
8
+ begin
9
+ require 'mascot/dat'
10
+ rescue LoadError
11
+ puts "You need the mascot-dat gem for this to work!"
12
+ puts "AND IT MUST BE THE PRINCELAB GITHUB FORK until changes get incorporated upstream!"
13
+ puts "> gem install mascot-dat"
14
+ raise LoadError
15
+ end
16
+ raise "need princelab mascot-dat gem!" unless Mascot::DAT::VERSION == "0.3.1.1"
17
+
# Target-decoy bundle: pairs the hits from a target search with the hits
# from its matching decoy search.
#
# target - collection of target hits (an Array of PSM objects when built by
#          read_mascot_dat_hits)
# decoy  - collection of decoy hits from the same search
SearchBundle = Struct.new(:target, :decoy) do
  # Pools several bundles into this one so that all targets are grouped and
  # all decoys are grouped. Whatever self previously held is replaced; the
  # given bundles are not modified.
  #
  # bundles - Array of objects responding to #target and #decoy, each
  #           returning an Array
  #
  # Returns self (with empty Arrays for target and decoy when bundles is
  # empty).
  def combine(bundles)
    # Assign through the struct writers: plain local variables here would
    # leave self.target / self.decoy untouched (nil on a fresh bundle).
    self.target = bundles.flat_map(&:target)
    self.decoy = bundles.flat_map(&:decoy)
    self
  end
end
28
+
# One peptide-spectrum match as collected by read_mascot_dat_hits:
#
# search_id - name captured from the .dat file's FILE= line
# id        - title of the query the hit belongs to
# aaseq     - peptide sequence of the hit
# charge    - signed Integer charge of the query
# score     - score reported for the hit
PSM = Struct.new(
  :search_id,
  :id,
  :aaseq,
  :charge,
  :score
)
30
+
# Converts a charge string into a signed Integer: "1+" => 1, "3-" => -3,
# "10+" => 10.
#
# st - String containing one or more digits immediately followed by '+'
#      or '-'
#
# Returns the Integer charge (negative when the sign is '-').
# Raises ArgumentError when no "<digits><sign>" pattern is present.
def charge_string_to_charge(st)
  # \d+ (not \d) so that multi-digit charges are not truncated to their
  # last digit
  md = st.match(/(\d+)([+\-])/)
  raise ArgumentError, "cannot parse a charge from #{st.inspect}" unless md

  magnitude = md[1].to_i
  md[2] == '-' ? -magnitude : magnitude
end
38
+
# Reads the target and decoy peptide hits out of one Mascot .dat file.
#
# The search_id stored on every PSM is the text captured between "FILE="
# and the first ".mgf" on the first line starting with "FILE="
# (case-insensitive); it stays nil when no such line exists.
#
# dat_file - path to the Mascot .dat file
#
# Returns a SearchBundle whose target holds the PSMs built from
# dat.peptides and whose decoy holds the PSMs built from
# dat.decoy_peptides. Hits without a query are skipped; hits without a
# score are not collected.
def read_mascot_dat_hits(dat_file)
  filename = nil
  IO.foreach(dat_file) do |line|
    # the dot is escaped so that only a literal ".mgf" ends the name
    if line =~ /^FILE=(.*?)\.mgf/i
      filename = $1.dup
      break
    end
  end

  dat = Mascot::DAT.open(dat_file)
  begin
    data = [:peptides, :decoy_peptides].map do |mthd|
      psms = []
      dat.send(mthd).each do |psm|
        next unless psm.query
        query = dat.query(psm.query)
        charge = charge_string_to_charge(query.charge)
        psms << PSM.new(filename, query.title, psm.pep, charge, psm.score) if psm.score
      end
      psms
    end
  ensure
    # close the .dat handle even when reading a hit raises
    dat.close
  end
  SearchBundle.new(*data)
end
62
+
63
+
# Verbose-only puts: writes the given messages to $stdout when $VERBOSE is
# truthy, and flushes $stdout in either case.
#
# Returns whatever $stdout.flush returns.
def putsv(*messages)
  out = $stdout
  out.puts(*messages) if $VERBOSE
  out.flush
end
68
+
# Extension appended to every output file name.
EXT = Mspire::Ident::PeptideHit::Qvalue::FILE_EXTENSION
# Base name of the single output file written when --combine is given.
combine_base = "combined"

opts = Trollop::Parser.new do
  #banner %Q{usage: #{File.basename(__FILE__)} <target>.xml <decoy>.xml ...
  banner %Q{usage: #{File.basename(__FILE__)} <mascot>.dat ...
outputs: <mascot>.phq.tsv
assumes a decoy search was run *with* the initial search
phq.tsv?: see schema/peptide_hit_qvalues.phq.tsv
}
  opt :combine, "groups target and decoy hits together from all files, writing to #{combine_base}#{EXT}", :default => false
  opt :z_together, "do not group by charge state", :default => false
  opt :verbose, "be verbose", :default => false
end

# Parser#parse raises on --help and on bad arguments; the standard handler
# turns those into the help text or an error message followed by an exit,
# instead of an uncaught exception.
opt = Trollop::with_standard_exception_handling(opts) do
  opts.parse(ARGV)
end

# no input files left after option parsing: show usage and stop
if ARGV.empty?
  opts.educate
  exit
end

# putsv keys off $VERBOSE, so the --verbose flag is moved into it
$VERBOSE = opt.delete(:verbose)

files = ARGV.to_a

# assumes each file has both target and decoy hits
bundles = files.map { |file| read_mascot_dat_hits(file) }

# output file name => SearchBundle to compute q-values for
to_run = {}
if opt[:combine]
  putsv "combining all target hits together and all decoy hits together"
  to_run[combine_base + EXT] = SearchBundle.new.combine(bundles)
else
  # one output per input: the input's extension is swapped for EXT
  files.zip(bundles) do |file, bundle|
    to_run[file.chomp(File.extname(file)) + EXT] = bundle
  end
end

to_run.each do |file, bundle|
  putsv "calculating qvalues for #{file}"
  hit_qvalue_pairs = Mspire::ErrorRate::Qvalue.target_decoy_qvalues(bundle.target, bundle.decoy, :z_together => opt[:z_together])
  # {|hit| hit.search_scores[:ionscore] }
  outfile = Mspire::Ident::PeptideHit::Qvalue.to_file(file, *hit_qvalue_pairs.transpose)
  putsv "created: #{outfile}"
end
117
+
118
+
@@ -0,0 +1,65 @@
1
+ require 'spec_helper'
2
+
3
+ require 'yaml'
4
+ path = 'mspire/ident/peptide/db'
5
+ require path + "/creator"
6
+
# Specs for Mspire::Ident::Peptide::Db::Creator: wildcard amino acid
# expansion and the command-line entry point.
describe 'creating a peptide centric database' do
  subject { Mspire::Ident::Peptide::Db::Creator.new }

  describe 'amino acid expansion' do

    it 'can expand out wildcard amino acid combinations' do
      array = subject.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
      array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
    end

    it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
      # this is from real data
      worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
      subject.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.should == true
    end

    it 'returns the peptide in the array if no expansion' do
      array = subject.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
      array.should == ['ZZZZZ']
    end
  end

  describe 'the commandline utility' do

    before do
      @fasta_file = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
      @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
    end

    it 'converts a fasta file into peptide centric db' do
      output_files = Mspire::Ident::Peptide::Db::Creator.cmdline([@fasta_file])
      output_files.first.should == File.expand_path(@output_file)
      File.exist?(@output_file).should == true
      # each YAML value is a tab-separated list of protein ids
      hash = {}
      YAML.load_file(@output_file).each do |k,v|
        hash[k] = v.split("\t")
      end
      sorted = hash.sort
      # these are merely frozen, not perfectly defined
      sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["P62258"]]
      sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["D2KTA8"]]
      sorted.size.should == 728
      #File.unlink(@output_file)
    end

    it 'lists approved enzymes and exits' do
      exited = false
      output = capture_stdout do
        begin
          Mspire::Ident::Peptide::Db::Creator.cmdline(['--list-enzymes'])
        rescue SystemExit
          exited = true
        end
      end
      # asserted outside the rescue so the example fails when cmdline
      # returns without exiting
      exited.should == true
      lines = output.split("\n")
      lines.include?("trypsin").should == true
      lines.include?("chymotrypsin").should == true
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'spec_helper'
2
+
3
+ require 'mspire/ident/peptide/db/io'
4
+
# Exercises Mspire::Ident::Peptide::Db::IO against a peptide centric YAML
# fixture: lookup by peptide sequence and enumeration of the entries.
describe 'reading a peptide centric DB' do
  before do
    @pepcentric = TESTFILES + "/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml"
  end

  it 'reads the file on disk with random access or is enumerable' do
    Mspire::Ident::Peptide::Db::IO.open(@pepcentric) do |db_io|
      # keyed access: known peptides give their protein ids, unknown give nil
      db_io["AVTEQGHELSNEER"].should == ["sp|P31946|1433B_HUMAN", "sp|P31946-2|1433B_HUMAN"]
      db_io["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
      db_io["SILLY WILLY"].should be_nil

      # enumeration: each entry starts with a String and ends with an Array
      db_io.each_with_index do |entry, _index|
        entry.first.should be_an_instance_of(String)
        entry.last.should be_a_kind_of(Array)
      end
    end
  end
end
@@ -4,105 +4,15 @@ require 'yaml'
4
4
  path = 'mspire/ident/peptide/db'
5
5
  require path
6
6
 
7
- module Kernel
8
-
9
- def capture_stdout
10
- out = StringIO.new
11
- $stdout = out
12
- yield
13
- out.rewind
14
- return out.read
15
- ensure
16
- $stdout = STDOUT
17
- end
18
-
19
- end
20
-
21
-
22
- describe 'a uniprot fasta file' do
23
-
7
+ describe 'reading a peptide centric db' do
24
8
  before do
25
- @fasta_file = [TESTFILES, path, 'uni_11_sp_tr.fasta'].join('/')
26
- end
27
-
28
- describe 'amino acid expansion' do
29
-
30
- it 'can expand out wildcard amino acid combinations' do
31
- array = Mspire::Ident::Peptide::Db.expand_peptides('ALXX', 'X' => %w(* % &), 'L' => %w(P Q) )
32
- array.sort.should == %w(AP** AP*% AP*& AP%* AP%% AP%& AP&* AP&% AP&& AQ** AQ*% AQ*& AQ%* AQ%% AQ%& AQ&* AQ&% AQ&&).sort
33
- end
34
-
35
- it 'will not expand explosive combinations (>MAX_NUM_AA_EXPANSION)' do
36
- # this is from real data
37
- worst_case = 'LTLLRPEKHEAATGVDTICTHRVDPIGPGLXXEXLYWELSXLTXXIXELGPYTLDR'
38
- Mspire::Ident::Peptide::Db.expand_peptides(worst_case, 'X' => %w(* % &)).nil?.should == true
39
- end
40
-
41
- it 'returns the peptide in the array if no expansion' do
42
- array = Mspire::Ident::Peptide::Db.expand_peptides('ZZZZZ', 'X' => %w(* % &), 'L' => %w(P Q) )
43
- array.should == ['ZZZZZ']
44
- end
45
-
9
+ @pepcentric = TESTFILES + "/mspire/ident/peptide/db/uni_11_sp_tr.PEPTIDE_CENTRIC.yml"
46
10
  end
47
11
 
48
- describe 'creating a peptide centric database' do
49
- before do
50
-
51
- #@output_file = [TESTFILES, path, 'uni_11_sp_tr.'].join('/')
52
- @output_file = [TESTFILES, path, "uni_11_sp_tr.msd_clvg2.min_aaseq4.yml"].join('/')
53
- end
54
-
55
- it 'converts a fasta file into peptide centric db' do
56
- output_files = Mspire::Ident::Peptide::Db.cmdline([@fasta_file])
57
- output_files.first.should == File.expand_path(@output_file)
58
- File.exist?(@output_file).should == true
59
- hash = {}
60
- YAML.load_file(@output_file).each do |k,v|
61
- hash[k] = v.split("\t")
62
- end
63
- sorted = hash.sort
64
- # these are merely frozen, not perfectly defined
65
- sorted.first.should == ["AAFDDAIAELDTLSEESYK", ["sp|P62258|1433E_HUMAN"]]
66
- sorted.last.should == ["YWCRLGPPRWICQTIVSTNQYTHHR", ["tr|D2KTA8|D2KTA8_HUMAN"]]
67
- sorted.size.should == 728
68
- File.unlink(@output_file)
69
- end
70
-
71
- it 'lists approved enzymes and exits' do
72
- output = capture_stdout do
73
- begin
74
- Mspire::Ident::Peptide::Db.cmdline(['--list-enzymes'])
75
- rescue SystemExit
76
- 1.should == 1 # we exited
77
- end
78
- end
79
- lines = output.split("\n")
80
- lines.include?("trypsin").should == true
81
- lines.include?("chymotrypsin").should == true
82
- end
83
- end
84
-
85
- describe 'reading a peptide centric database' do
86
- before do
87
- outfiles = Mspire::Ident::Peptide::Db.cmdline([@fasta_file])
88
- @outfile = outfiles.first
89
- end
90
-
91
- it 'creates a hash that can retrieve peptides as an array' do
92
- hash = Mspire::Ident::Peptide::Db.new(@outfile)
93
- hash["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
94
- hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
95
- end
96
-
97
- it 'reads the file on disk with random access or is enumerable' do
98
- Mspire::Ident::Peptide::Db::IO.open(@outfile) do |io|
99
- io["AVTEQGHELSNEER"].should == %w(sp|P31946|1433B_HUMAN sp|P31946-2|1433B_HUMAN)
100
- io["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
101
- io.each_with_index do |key_prots, i|
102
- key_prots.first.should be_an_instance_of String
103
- key_prots.last.should be_a_kind_of Array
104
- end
105
- end
106
- end
12
+ it 'creates a hash that can retrieve peptides as an array' do
13
+ hash = Mspire::Ident::Peptide::Db.new(@pepcentric)
14
+ hash["AVTEQGHELSNEER"].should == ["sp|P31946|1433B_HUMAN", "sp|P31946-2|1433B_HUMAN"]
15
+ hash["VRAAR"].should == ["tr|D3DX18|D3DX18_HUMAN"]
16
+ hash["BANNANA"].should == nil
107
17
  end
108
18
  end