mspire 0.1.5 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
@@ -0,0 +1,136 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
# Here is what I plotted: take each peptide-protein pair identified on the top-hit scans -- you will likely have the same peptide-protein pair identified on multiple scans -- and plot the top probability of each such pair.
|
4
|
+
# There are 43 such identified peptides for Sashimi, whereas SEQUEST identifies about 66. So you'll have 66 (1-p) values to plot; I had 43. Similarly for OMICS.
|
5
|
+
|
6
|
+
require 'spec_id'
|
7
|
+
require 'fasta'
|
8
|
+
require 'optparse'
|
9
|
+
require 'ostruct'
|
10
|
+
|
11
|
+
# returns an accession number if available, or the entire reference (less the
|
12
|
+
# starting '>')
|
13
|
+
def get_fasta_accession(fasta_prot)
|
14
|
+
head = fasta_prot.header
|
15
|
+
if head =~ ACC_REGEX
|
16
|
+
$1.dup
|
17
|
+
else
|
18
|
+
head.sub(/^>/, '').rstrip
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# returns the accession number from a reference, or the complete reference
|
23
|
+
def accession_from_ref(pep)
|
24
|
+
ref = pep.prot.reference
|
25
|
+
if ref =~ ACC_REGEX
|
26
|
+
$1.dup
|
27
|
+
else
|
28
|
+
ref.rstrip
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def get_pep_prot_accession(pep)
|
33
|
+
acc = pep.prot.accession
|
34
|
+
if !acc || acc == '0' || acc == 0
|
35
|
+
accession_from_ref(pep)
|
36
|
+
else
|
37
|
+
acc
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
#####################################################################
|
42
|
+
# MAIN
|
43
|
+
#####################################################################
|
44
|
+
|
45
|
+
opt = OpenStruct.new
|
46
|
+
opt.p = 'prob'
|
47
|
+
opts = OptionParser.new do |op|
|
48
|
+
op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml true_hits.fasta"
|
49
|
+
op.separator " [prints to stdout tab delimited table]"
|
50
|
+
op.on('-t', '--ties', 'allow ties on best hit') {|v| opt.t = v }
|
51
|
+
op.on('-p', '--param <s>', 'param: (xcorr | prob)') {|v| opt.p = v}
|
52
|
+
end
|
53
|
+
opts.parse!
|
54
|
+
|
55
|
+
if ARGV.size < 2
|
56
|
+
puts opts
|
57
|
+
exit
|
58
|
+
end
|
59
|
+
|
60
|
+
case opt.p
|
61
|
+
when 'prob'
|
62
|
+
param = :peptide_probability
|
63
|
+
best = :first
|
64
|
+
when 'xcorr'
|
65
|
+
param = :xcorr
|
66
|
+
best = :last
|
67
|
+
else
|
68
|
+
abort "incorrect param: #{opt.p}"
|
69
|
+
end
|
70
|
+
|
71
|
+
############################
|
72
|
+
# GLOBALS
|
73
|
+
DELIM = "\t"
|
74
|
+
ACC_REGEX = /\|(.*?)\|/o
|
75
|
+
############################
|
76
|
+
|
77
|
+
bioworks = ARGV[0]
|
78
|
+
fasta_file = ARGV[1]
|
79
|
+
|
80
|
+
fprots = Fasta.new.read_file(fasta_file).prots
|
81
|
+
gi_nums = fprots.map {|prot| get_fasta_accession(prot) }
|
82
|
+
|
83
|
+
peptides = SpecID.new(bioworks).peps
|
84
|
+
|
85
|
+
|
86
|
+
## Get the best peptide(s) per scan
|
87
|
+
top_peps_per_scan = []
|
88
|
+
|
89
|
+
peptides.hash_by(:base_name, :first_scan).each do |bn_scan, pep_array|
|
90
|
+
sorted_list = pep_array.sort_by {|pep| pep.send(param).to_f }
|
91
|
+
|
92
|
+
top_peps = if best == :first ; [sorted_list.shift] ; else [sorted_list.pop] end
|
93
|
+
found_another = false
|
94
|
+
sorted_list.each do |pep|
|
95
|
+
if pep.send(param).to_f == top_peps.send(best).send(param).to_f
|
96
|
+
if opt.t
|
97
|
+
top_peps << pep
|
98
|
+
else
|
99
|
+
found_another = true
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
unless found_another
|
104
|
+
top_peps_per_scan.push( *top_peps )
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
|
109
|
+
## Get the best scoring peptide per peptide/prot from list of best
|
110
|
+
## peptides/scan
|
111
|
+
top_pep_seq_prots = top_peps_per_scan.hash_by {|pep| [pep.sequence, get_pep_prot_accession(pep)] }.map do |k,pep_array|
|
112
|
+
pep_array.sort_by {|pep| pep.send(param).to_f }.send(best)
|
113
|
+
end
|
114
|
+
|
115
|
+
## sort the peptides by best score
|
116
|
+
sorted_top_pep_seq_prots = top_pep_seq_prots.sort_by {|pep| pep.send(param).to_f }
|
117
|
+
if best == :last ; sorted_top_pep_seq_prots.reverse! end
|
118
|
+
|
119
|
+
## print a table (for plotting) of the chosen score vs. the running count of true positives (TPs)
|
120
|
+
puts ['#TPs', param, 'sequence', 'protein accession', 'xcorr'].join(DELIM)
|
121
|
+
tps = 0
|
122
|
+
sorted_top_pep_seq_prots.each do |pep|
|
123
|
+
if gi_nums.include?( get_pep_prot_accession(pep) )
|
124
|
+
tps += 1
|
125
|
+
puts [tps.to_s, pep.send(param), pep.sequence, get_pep_prot_accession(pep), pep.xcorr].join(DELIM)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
134
|
+
|
135
|
+
|
136
|
+
|
data/script/top_hit_per_scan.rb
CHANGED
@@ -4,7 +4,6 @@
|
|
4
4
|
cats = %w(base_name sequence xcorr deltacn first_scan last_scan)
|
5
5
|
###################################################################
|
6
6
|
|
7
|
-
require 'pp'
|
8
7
|
require 'spec_id'
|
9
8
|
require 'hash_by'
|
10
9
|
|
@@ -46,7 +45,11 @@ outfile_top = file.sub(/\.xml$/, extension_top)
|
|
46
45
|
outfile_all = file.sub(/\.xml$/, extension_all)
|
47
46
|
|
48
47
|
sp = SpecID.new(file)
|
49
|
-
|
48
|
+
|
49
|
+
# The old (incorrect) version:
|
50
|
+
# pep_hash = sp.peps.hash_by(:first_scan, :last_scan)
|
51
|
+
# The correct version:
|
52
|
+
pep_hash = sp.peps.hash_by(:base_name, :first_scan, :last_scan)
|
50
53
|
top_per_scan = pep_hash.map {|k,v| v.sort_by {|ob| ob.xcorr.to_f }.last }
|
51
54
|
top_per_scan = top_per_scan.sort_by {|pep| pep.first_scan.to_i }
|
52
55
|
|
data/test/tc_aa_freqs.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'spec_id/aa_freqs'
|
5
|
+
|
6
|
+
|
7
|
+
class FastaTest < Test::Unit::TestCase
|
8
|
+
|
9
|
+
def initialize(arg)
|
10
|
+
super(arg)
|
11
|
+
@tfiles = File.dirname(__FILE__) + '/tfiles/'
|
12
|
+
@sf = @tfiles + "small.fasta"
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_basic
|
16
|
+
obj = SpecID::AAFreqs.new(@sf)
|
17
|
+
expect = {:I=>0.0628918621937819, :S=>0.0539719475147049, :D=>0.0526145691939758, :Z=>0.0, :L=>0.102772929998061, :T=>0.0491888048607071, :E=>0.0609527503070261, :O=>0.0, :C=>0.0157714433456144, :K=>0.0471850559110594, :U=>0.0, :Q=>0.0382651412319824, :W=>0.0137030573330748, :A=>0.101997285243359, :M=>0.0294745006786892, :J=>0.0, :G=>0.0811195139292871, :Y=>0.0254670027793937, :X=>0.0, :F=>0.0418201796910348, :R=>0.0546829552065154, :V=>0.0702604873634542, :H=>0.0213302307543145, :B=>0.0, :N=>0.03471010277293, :P=>0.0418201796910348}
|
18
|
+
aaf = obj.aafreqs
|
19
|
+
expect.each do |k,v|
|
20
|
+
assert(aaf.key?(k))
|
21
|
+
assert_in_delta(v, aaf[k], 0.00000001, "freqs match up")
|
22
|
+
end
|
23
|
+
sum = 0.0
|
24
|
+
aaf.values.each do |v|
|
25
|
+
sum += v
|
26
|
+
end
|
27
|
+
assert_in_delta(1.0, sum, 0.0000000000001, "all freqs add to 1")
|
28
|
+
end
|
29
|
+
|
30
|
+
def test_probability_of_length_table
|
31
|
+
# p SpecID::AAFreqs.probability_of_length_table(0.01, 4)
|
32
|
+
assert_equal_arrs_in_delta([0.0, 0.01, 0.0199, 0.029701, 0.0394039900000001], SpecID::AAFreqs.probability_of_length_table(0.01, 4), 0.000000001)
|
33
|
+
|
34
|
+
assert_equal_arrs_in_delta([0.0, 0.2, 0.36, 0.488, 0.5904], SpecID::AAFreqs.probability_of_length_table(0.2, 4), 0.000000001)
|
35
|
+
end
|
36
|
+
|
37
|
+
def test_actual_and_expected_number
|
38
|
+
fobj = Fasta.new.read_file(@sf)
|
39
|
+
obj = SpecID::AAFreqs.new
|
40
|
+
obj.aafreqs = obj.calculate_frequencies(fobj)
|
41
|
+
|
42
|
+
peptide_aaseqs = fobj.prots.map do |prot|
|
43
|
+
prot.aaseq[0..12]
|
44
|
+
end
|
45
|
+
assert_equal(50, peptide_aaseqs.size, 'sanity check')
|
46
|
+
(ac,ex) = obj.actual_and_expected_number(peptide_aaseqs, :C, 1)
|
47
|
+
assert_equal(9, ac)
|
48
|
+
assert_in_delta( 9.33530631238985, ex, 0.0000000001)
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
def assert_equal_arrs_in_delta(expect, actual, delta)
|
53
|
+
expect.each_with_index do |v,i|
|
54
|
+
assert_in_delta(v, actual[i], delta)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
|
59
|
+
end
|
data/test/tc_bioworks.rb
CHANGED
@@ -8,7 +8,8 @@ class BioworksTest < Test::Unit::TestCase
|
|
8
8
|
def initialize(arg)
|
9
9
|
super(arg)
|
10
10
|
@tfiles = File.dirname(__FILE__) + '/tfiles/'
|
11
|
-
@
|
11
|
+
@tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
|
12
|
+
@tf_bioworks_xml = @tfiles_l + "bioworks.xml"
|
12
13
|
@tf_bioworks_xml_small = @tfiles + "bioworks_small.xml"
|
13
14
|
@tf_bioworks_xml_really_small = @tfiles + "bioworks_with_INV_small.xml"
|
14
15
|
@tf_params = @tfiles + "bioworks32.params"
|
@@ -34,9 +35,13 @@ class BioworksTest < Test::Unit::TestCase
|
|
34
35
|
end
|
35
36
|
|
36
37
|
def Xtest_xml_parsing_speed
|
38
|
+
if File.exist? @tfiles_l
|
37
39
|
#puts Benchmark.bm {|b|
|
38
40
|
obj = SpecID::Bioworks.new(@tf_bioworks_xml)
|
39
41
|
#}
|
42
|
+
else
|
43
|
+
assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
|
44
|
+
end
|
40
45
|
end
|
41
46
|
|
42
47
|
def test_xml_parsing_bioworks_single
|
@@ -9,11 +9,10 @@ class BioworksToPepXMLTest < Test::Unit::TestCase
|
|
9
9
|
def initialize(arg)
|
10
10
|
super(arg)
|
11
11
|
@tfiles = File.dirname(__FILE__) + '/tfiles/'
|
12
|
-
@
|
12
|
+
@tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
|
13
|
+
@tf_mzxml_path = @tfiles_l + "yeast_gly_mzXML"
|
13
14
|
@tf_bioworks_xml = @tfiles + "bioworks_small.xml"
|
14
15
|
@tf_params = @tfiles + "bioworks32.params"
|
15
|
-
@tf_opd1 = @tfiles + "opd1/bioworks.000.oldparams.xml"
|
16
|
-
@tf_opd1_mzxml = @tfiles + "opd1/000.mzXML.timeIndex"
|
17
16
|
@no_delete = false
|
18
17
|
@out_path = @tfiles + 'pepxml/'
|
19
18
|
@cmd = "ruby -I#{File.join(File.dirname(__FILE__), "..", "lib")} -S bioworks_to_pepxml.rb "
|
@@ -33,23 +32,31 @@ class BioworksToPepXMLTest < Test::Unit::TestCase
|
|
33
32
|
end
|
34
33
|
|
35
34
|
def test_basic
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
35
|
+
if File.exist? @tfiles_l
|
36
|
+
cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path} -d /work/special/path"
|
37
|
+
prc = proc {|file|
|
38
|
+
assert(File.exist?(file), "#{file} exists")
|
39
|
+
}
|
40
|
+
_basic(cmd, prc)
|
41
|
+
unless @no_delete then FileUtils.rm_rf(@out_path) end
|
42
|
+
else
|
43
|
+
assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
|
44
|
+
end
|
42
45
|
end
|
43
46
|
|
44
47
|
def test_database
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
48
|
+
if File.exist? @tfiles_l
|
49
|
+
cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path}"
|
50
|
+
db_re = /C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta/
|
51
|
+
assert_match(db_re, IO.read(@tf_params))
|
52
|
+
prc = proc {|file|
|
53
|
+
assert(File.exist?(file))
|
54
|
+
assert_no_match(db_re, IO.read(file))
|
55
|
+
}
|
56
|
+
_basic(cmd, prc)
|
57
|
+
unless @no_delete then FileUtils.rm_rf(@out_path) end
|
58
|
+
else
|
59
|
+
assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
|
60
|
+
end
|
54
61
|
end
|
55
62
|
end
|
data/test/tc_fasta.rb
CHANGED
@@ -4,6 +4,8 @@ require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
|
|
4
4
|
require 'test/unit'
|
5
5
|
require 'fasta'
|
6
6
|
require 'assert_files'
|
7
|
+
require 'sample_enzyme'
|
8
|
+
require 'set'
|
7
9
|
|
8
10
|
|
9
11
|
module Test::Unit::Assertions
|
@@ -11,6 +13,7 @@ module Test::Unit::Assertions
|
|
11
13
|
end
|
12
14
|
|
13
15
|
class FastaTest < Test::Unit::TestCase
|
16
|
+
NODELETE = false
|
14
17
|
|
15
18
|
def initialize(arg)
|
16
19
|
super(arg)
|
@@ -73,7 +76,7 @@ class FastaTest < Test::Unit::TestCase
|
|
73
76
|
|
74
77
|
def test_mod
|
75
78
|
## Testing shuffle:
|
76
|
-
|
79
|
+
`#{@fasta_mod_cmd + 'shuffle ' + @sf}`
|
77
80
|
assert(File.exist?(@sf_shuffle), "output file #{@sf_shuffle} exists")
|
78
81
|
ob1 = Fasta.new.read_file(@sf)
|
79
82
|
ob2 = Fasta.new.read_file(@sf_shuffle)
|
@@ -83,7 +86,7 @@ class FastaTest < Test::Unit::TestCase
|
|
83
86
|
assert(_are_shuffled?(ob1,ob2))
|
84
87
|
|
85
88
|
## Testing invert:
|
86
|
-
|
89
|
+
`#{@fasta_mod_cmd + 'invert ' + @sf}`
|
87
90
|
assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
|
88
91
|
ob1 = Fasta.new.read_file(@sf)
|
89
92
|
ob2 = Fasta.new.read_file(@sf_invert)
|
@@ -94,7 +97,7 @@ class FastaTest < Test::Unit::TestCase
|
|
94
97
|
|
95
98
|
## Testing prefix
|
96
99
|
#puts "#{@fasta_mod_cmd + '-p _HELLO_ invert ' + @sf}"
|
97
|
-
|
100
|
+
`#{@fasta_mod_cmd + 'invert -p _HELLO_ ' + @sf}` # NOT WORKING!
|
98
101
|
assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
|
99
102
|
ob1 = Fasta.new.read_file(@sf)
|
100
103
|
ob2 = Fasta.new.read_file(@sf_invert)
|
@@ -176,6 +179,81 @@ class FastaTest < Test::Unit::TestCase
|
|
176
179
|
end
|
177
180
|
end
|
178
181
|
|
182
|
+
def test_invert_tryptic_peptides
|
183
|
+
# FOR INDIVIDUAL PROTEINS:
|
184
|
+
seq = 'ABCKCDERDEKDGEKWXYRRKDER'
|
185
|
+
# tryptic = ABCK, CDER, DEK, DGEK, WXYR, R, K, DER
|
186
|
+
tryp = SampleEnzyme.tryptic(seq)
|
187
|
+
reverse_tryptic = %w(CBAK EDCR EDK EGDK YXWR R K EDR)
|
188
|
+
prot = Fasta::Prot.new(nil, seq)
|
189
|
+
prot.invert_tryptic_peptides!
|
190
|
+
assert_equal(reverse_tryptic.join(''), prot.aaseq, "reversing tryptic peptides")
|
191
|
+
|
192
|
+
seq = 'XYRABCD'
|
193
|
+
prot = Fasta::Prot.new(nil, seq)
|
194
|
+
prot.invert_tryptic_peptides!
|
195
|
+
assert_equal('YXRDCBA', prot.aaseq, 'last peptide treated special')
|
196
|
+
|
197
|
+
seq = 'XYRPABCD'
|
198
|
+
prot = Fasta::Prot.new(nil, seq)
|
199
|
+
prot.invert_tryptic_peptides!
|
200
|
+
assert_equal('DCBAPRYX', prot.aaseq, 'with a proline')
|
201
|
+
|
202
|
+
end
|
203
|
+
|
204
|
+
def test_fraction_of_prots
|
205
|
+
peps = [['>silly1', "PEPTIDE"], ['>silly2', "ANOTHER"], ['>silly3', "AGAIN"], ['>silly4', "LARMA"]]
|
206
|
+
prots = peps.map do |header, seq|
|
207
|
+
Fasta::Prot.new(header, seq)
|
208
|
+
end
|
209
|
+
f = Fasta.new(prots)
|
210
|
+
# simple:
|
211
|
+
n = f.fraction_of_prots(1.0)
|
212
|
+
assert_equal(f.prots.map{|v| v.header }.to_set, n.prots.map{|v| v.header }.to_set, "same headers")
|
213
|
+
assert_equal(f.prots.map{|v| v.aaseq }.to_set, n.prots.map{|v| v.aaseq }.to_set, "same aaseqs")
|
214
|
+
|
215
|
+
pre = proc {|cnt| "SHUFF_f#{cnt}_" }
|
216
|
+
# test prefix
|
217
|
+
n = f.fraction_of_prots(1.0, pre)
|
218
|
+
n.prots.each do |prot|
|
219
|
+
assert_match(/^>SHUFF_f0_/, prot.header, "contains new prefix")
|
220
|
+
end
|
221
|
+
|
222
|
+
# smaller
|
223
|
+
n = f.fraction_of_prots(0.75, pre)
|
224
|
+
assert_equal(3, n.prots.size, "correct number of proteins")
|
225
|
+
# bigger
|
226
|
+
n = f.fraction_of_prots(2.5, pre)
|
227
|
+
assert_equal(10, n.prots.size, "correct number of proteins")
|
228
|
+
n.prots[0..3].each {|prt| assert_match(/^>SHUFF_f0_/, prt.header ) }
|
229
|
+
n.prots[4..7].each {|prt| assert_match(/^>SHUFF_f1_/, prt.header ) }
|
230
|
+
n.prots[8..9].each {|prt| assert_match(/^>SHUFF_f2_/, prt.header ) }
|
231
|
+
# crazy
|
232
|
+
n = f.fraction_of_prots(1.33, pre)
|
233
|
+
assert_equal(6, n.prots.size, "correct number of proteins")
|
234
|
+
end
|
235
|
+
|
236
|
+
def test_inverted_tryptic_peptides_for_file
|
237
|
+
# for a file:
|
238
|
+
tmpfile = @tfiles + "fasta.tmp"
|
239
|
+
fasta = Fasta.new.read_file(@sf)
|
240
|
+
fasta.aaseq_invert_tryptic_peptides!
|
241
|
+
fasta.write_file(tmpfile)
|
242
|
+
lines = IO.readlines(tmpfile)
|
243
|
+
#normal = 'MKRISTTITTTITITTGNGAG'
|
244
|
+
inverted_tryptic = 'MKRGAGNGTTITITTTITTSI' ## ?????
|
245
|
+
assert_equal(inverted_tryptic, lines[1].chomp)
|
246
|
+
#normal = 'MATYLIGDVHGCYDELIALLHKVEFTPGKDTLWLTGDLVARGPGSLDVLRYVKSLGDSVRLVLGNHDLHL
|
247
|
+
# LAVFAGISRNKPKDRLTPLLEAPDADELLNWLRRQPLLQIDEEKKLVMAHAGITPQWDLQTAKECARDVE
|
248
|
+
# AVLSSDSYPFFLDAMYGDMPNNWSPELRGLGRLRFITNAFTRMRFCFPNGQLDMYSKESPEEAPAPLKPW
|
249
|
+
# FAIPGPVAEEYSIAFGHWASLEGKGTPEGIYALDTGCCWGGTLTCLRWEDKQYFVQPSNRHKDLGEAAAS'
|
250
|
+
inverted_tryptic = 'HLLAILEDYCGHVDGILYTAMKGPTFEVKAVLDGTLWLTDRLVDLSGPGRVYKVSDGLSRSIGAFVALLHLDHNGLVLRPKNKDRLWNLLEDADPAELLPTLRREEDIQLLPQKKATQLDWQPTIGAHAMVLKACERLEPSWNNPMDGYMADLFFPYSDSSLVAEVDRGLGRLRTFANTIFRMRSYMDLQGNPFCFKGELSAWHGFAISYEEAVPGPIAFWPKLPAPAEEPSEKLCTLTGGWCCGTDLAYIGEPTGRDEWKNSPQVFYQRHKSAAAEGLD'
|
251
|
+
assert_equal(inverted_tryptic, lines[-1].chomp)
|
252
|
+
File.unlink(tmpfile) unless NODELETE
|
253
|
+
end
|
254
|
+
|
255
|
+
|
256
|
+
|
179
257
|
## HELPER ASSERTIONS:
|
180
258
|
|
181
259
|
def _are_inverted?(obj1, obj2)
|
@@ -0,0 +1,147 @@
|
|
1
|
+
|
2
|
+
require 'test/unit'
|
3
|
+
require 'fasta'
|
4
|
+
|
5
|
+
Filestring = ">gi|P1
|
6
|
+
AMKRGAN
|
7
|
+
>gi|P2
|
8
|
+
CRGATKKTAGRPMEK
|
9
|
+
>gi|P3
|
10
|
+
PEPTIDE
|
11
|
+
"
|
12
|
+
|
13
|
+
Rev = ">gi|P1
|
14
|
+
NAGRKMA
|
15
|
+
>gi|P2
|
16
|
+
KEMPRGATKKTAGRC
|
17
|
+
>gi|P3
|
18
|
+
EDITPEP
|
19
|
+
"
|
20
|
+
|
21
|
+
RevTryptic = ">gi|P1
|
22
|
+
MAKRNAG
|
23
|
+
>gi|P2
|
24
|
+
CRTAGKKEMPRGATK
|
25
|
+
>gi|P3
|
26
|
+
EDITPEP
|
27
|
+
"
|
28
|
+
|
29
|
+
ShuffTryptic = ">gi|P1
|
30
|
+
MAKRNAG
|
31
|
+
>gi|P2
|
32
|
+
CRTAGKKEMPRGATK
|
33
|
+
>gi|P3
|
34
|
+
EDITPEP
|
35
|
+
"
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
class TestBasic < Test::Unit::TestCase
|
40
|
+
|
41
|
+
def setup
|
42
|
+
testdir = File.dirname(__FILE__)
|
43
|
+
libdir = testdir + '/../lib'
|
44
|
+
bindir = testdir + '/../bin'
|
45
|
+
progname = "fasta_shaker.rb"
|
46
|
+
@cmd = "ruby -I #{libdir} #{bindir}/#{progname} "
|
47
|
+
@tfiles = testdir + '/tfiles/'
|
48
|
+
@tmpfile = @tfiles + "littlefasta.trash.fasta"
|
49
|
+
File.open(@tmpfile, "w") {|fh| fh.print Filestring }
|
50
|
+
@f = @tfiles + "trash.fasta"
|
51
|
+
end
|
52
|
+
|
53
|
+
def teardown
|
54
|
+
File.unlink @tmpfile if File.exist? @tmpfile
|
55
|
+
File.unlink @f if File.exist? @f
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_reverse
|
59
|
+
cmd = @cmd + "reverse #{@tmpfile} -o #{@f}"
|
60
|
+
system cmd
|
61
|
+
assert_equal(Rev, fastap(@f).to_s)
|
62
|
+
end
|
63
|
+
|
64
|
+
def test_reverse_tryptic
|
65
|
+
cmd = @cmd + "reverse #{@tmpfile} -o #{@f} --tryptic_peptides"
|
66
|
+
system cmd
|
67
|
+
assert_equal(RevTryptic, fastap(@f).to_s)
|
68
|
+
end
|
69
|
+
|
70
|
+
def test_shuff_tryptic
|
71
|
+
cmd = @cmd + "shuffle #{@tmpfile} -o #{@f} --tryptic_peptides"
|
72
|
+
system cmd
|
73
|
+
lns = fastap(@f).to_s.split("\n")
|
74
|
+
assert_equal('KR', lns[1][2..3])
|
75
|
+
assert_equal('R', lns[3][1..1])
|
76
|
+
assert_equal('CRGATKKTAGRPMEK'.size, lns[3].size, "sequence is same size")
|
77
|
+
assert_not_equal('CRGATKKTAGRPMEK', lns[3], "sequence is randomised from original [remote chance of failure] rerun to make sure")
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_shuffle
|
81
|
+
cmd = @cmd + "shuffle #{@tmpfile} -o #{@f}"
|
82
|
+
system cmd
|
83
|
+
clines = strlns(Filestring)
|
84
|
+
lns = fastalns(@f)
|
85
|
+
lns.each_with_index do |line,i|
|
86
|
+
assert_equal(clines[i].size, line.size, "same size lines: A: <<#{clines[i]}>> B: <<#{line}>>")
|
87
|
+
end
|
88
|
+
assert_equal('CRGATKKTAGRPMEK'.size, lns[3].size, "sequence is same size")
|
89
|
+
assert_not_equal('CRGATKKTAGRPMEK', lns[3], "sequence is randomised from original [remote chance of failure] rerun to make sure")
|
90
|
+
end
|
91
|
+
|
92
|
+
def test_cat
|
93
|
+
cmd = @cmd + "reverse #{@tmpfile} -c -o #{@f}"
|
94
|
+
`#{cmd}` ## suppress warning
|
95
|
+
lns = fastalns(@f)
|
96
|
+
assert_equal(strlns(Filestring), lns[0..5], "first part equal")
|
97
|
+
assert_equal(strlns(Rev), lns[6..-1], "second part equal")
|
98
|
+
end
|
99
|
+
|
100
|
+
def test_fraction
|
101
|
+
cmd = @cmd + "reverse #{@tmpfile} -f 2.6 -o #{@f}"
|
102
|
+
`#{cmd}`
|
103
|
+
assert_equal(8, fastap(@f).size)
|
104
|
+
|
105
|
+
cmd = @cmd + "shuffle #{@tmpfile} -f 2.0 -c -p MINE_ -o #{@f}"
|
106
|
+
`#{cmd}`
|
107
|
+
assert_equal(9, fastap(@f).size)
|
108
|
+
fp = fastap(@f)
|
109
|
+
fp[0..2].each do |prt|
|
110
|
+
assert_match(/^>/, prt.header, "prefix matches")
|
111
|
+
end
|
112
|
+
fp[3..5].each do |prt|
|
113
|
+
assert_match(/^>MINE_f0_/, prt.header, "prefix matches")
|
114
|
+
end
|
115
|
+
fp[6..8].each do |prt|
|
116
|
+
assert_match(/^>MINE_f1_/, prt.header, "prefix matches")
|
117
|
+
end
|
118
|
+
#cmd = @cmd + "reverse #{@tmpfile} -c -f 2.0 -o #{@f}"
|
119
|
+
end
|
120
|
+
|
121
|
+
def test_prefix
|
122
|
+
cmd = @cmd + "reverse #{@tmpfile} -p SILLY_ -o #{@f}"
|
123
|
+
`#{cmd}`
|
124
|
+
fp = fastap(@f)
|
125
|
+
fp.each do |prt|
|
126
|
+
assert_match(/^>SILLY_.+/, prt.header)
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
private
|
132
|
+
def strlns(str)
|
133
|
+
str.split("\n")
|
134
|
+
end
|
135
|
+
|
136
|
+
def fastalns(fn)
|
137
|
+
assert(File.exist?(fn), "FILE: #{fn} exists")
|
138
|
+
IO.read(fn).split("\n")
|
139
|
+
end
|
140
|
+
|
141
|
+
# returns the proteins (prots) of the Fasta object read from file fn
|
142
|
+
def fastap(fn)
|
143
|
+
assert(File.exist?(fn), "FILE: #{fn} exists")
|
144
|
+
Fasta.new.read_file(fn).prots
|
145
|
+
end
|
146
|
+
|
147
|
+
end
|
data/test/tc_gi.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
|
2
|
+
require 'test/unit'
|
3
|
+
require 'gi'
|
4
|
+
|
5
|
+
|
6
|
+
class Gi2AnnotTest < Test::Unit::TestCase
|
7
|
+
ROOT_DIR = File.join(File.dirname(__FILE__), '..')
|
8
|
+
|
9
|
+
def test_single_query
|
10
|
+
#begin
|
11
|
+
annot = GI.gi2annot([16130548]).first
|
12
|
+
#rescue
|
13
|
+
puts "SKIPPING gi2annot test since no internet connection available:"
|
14
|
+
puts "#{$!}"
|
15
|
+
assert true
|
16
|
+
#else
|
17
|
+
assert_equal('CP4-57 prophage; RNase LS [Escherichia coli K12]'+"\n", annot)
|
18
|
+
#end
|
19
|
+
end
|
20
|
+
end
|
data/test/tc_id_class_anal.rb
CHANGED
@@ -23,14 +23,12 @@ class IDClassAnalTest < Test::Unit::TestCase
|
|
23
23
|
output = `#{@cmd} -p INV_ #{@tf_proph_inv}`
|
24
24
|
fps = [1.00, 1.00, 0.97]
|
25
25
|
tps = [1.00, 1.00, 0.98, 0.97, 0.97, 0.97, 0.97]
|
26
|
-
puts output
|
27
26
|
#File.open("tmp.csv","w") do |fh| fh.print output end
|
28
27
|
assert 1
|
29
28
|
end
|
30
29
|
|
31
|
-
def
|
30
|
+
def test_basic
|
32
31
|
output = `#{@cmd} -p INV_ #{@tf_bioworks_esmall_xml}`
|
33
|
-
# @TODO: that's the output, need to grab for consistency sake
|
34
32
|
exp = [
|
35
33
|
[1, 1.0, 0.0],
|
36
34
|
[2, 1.0, 0.0],
|
@@ -40,11 +38,11 @@ class IDClassAnalTest < Test::Unit::TestCase
|
|
40
38
|
[6, 1.0, 0.0],
|
41
39
|
[9, 1.0, 0.0],
|
42
40
|
[10, 1.0, 0.0],
|
43
|
-
[11, 0.
|
44
|
-
[12, 0.
|
45
|
-
[13, 0.
|
46
|
-
[14, 0.
|
47
|
-
[15, 0.
|
41
|
+
[11, 0.909090909090909],
|
42
|
+
[12, 0.916666666666667],
|
43
|
+
[13, 0.923076923076923],
|
44
|
+
[14, 0.928571428571429],
|
45
|
+
[15, 0.866666666666667],
|
48
46
|
]
|
49
47
|
outarr = output.split($/)
|
50
48
|
exp.each_with_index do |line,i|
|
@@ -55,18 +53,17 @@ class IDClassAnalTest < Test::Unit::TestCase
|
|
55
53
|
end
|
56
54
|
end
|
57
55
|
|
58
|
-
def
|
56
|
+
def test_multiple_output
|
59
57
|
myplot = 'class_anal.toplot'
|
60
58
|
output = `#{@cmd} -j -p INV_,SHUFF_ #{@tf_bioworks_esmall_xml} #{@tf_bioworks_shuff}`
|
61
59
|
assert(output.size > 10) ## @TODO: BETTER HERE
|
62
60
|
assert(File.exist?(myplot), "file #{myplot} exists")
|
63
61
|
File.unlink myplot
|
64
|
-
|
65
62
|
end
|
66
63
|
|
67
|
-
def
|
64
|
+
def test_jtplot_output
|
68
65
|
myplot = 'class_anal.toplot'
|
69
|
-
`#{@cmd} -p INV_ -j #{@tf_bioworks_esmall_xml}`
|
66
|
+
output = `#{@cmd} -p INV_ -j #{@tf_bioworks_esmall_xml}`
|
70
67
|
assert(File.exist?(myplot), "file #{myplot} exists")
|
71
68
|
File.unlink myplot
|
72
69
|
end
|