mspire 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +5 -2
- data/bin/bioworks_to_pepxml.rb +84 -40
- data/bin/fasta_shaker.rb +100 -0
- data/bin/filter_spec_id.rb +185 -23
- data/bin/gi2annot.rb +2 -110
- data/bin/id_class_anal.rb +31 -21
- data/bin/id_precision.rb +12 -8
- data/bin/{false_positive_rate.rb → precision.rb} +1 -1
- data/bin/protein_summary.rb +55 -62
- data/changelog.txt +34 -0
- data/lib/align.rb +0 -1
- data/lib/fasta.rb +88 -24
- data/lib/gi.rb +114 -0
- data/lib/roc.rb +64 -58
- data/lib/spec_id/aa_freqs.rb +166 -0
- data/lib/spec_id/bioworks.rb +5 -1
- data/lib/spec_id/precision.rb +427 -0
- data/lib/spec_id/proph.rb +2 -2
- data/lib/spec_id/sequest.rb +810 -113
- data/lib/spec_id/srf.rb +486 -0
- data/lib/spec_id.rb +107 -23
- data/release_notes.txt +11 -0
- data/script/estimate_fpr_by_cysteine.rb +226 -0
- data/script/filter-peps.rb +3 -3
- data/script/find_cysteine_background.rb +137 -0
- data/script/gen_database_searching.rb +11 -7
- data/script/genuine_tps_and_probs.rb +136 -0
- data/script/top_hit_per_scan.rb +5 -2
- data/test/tc_aa_freqs.rb +59 -0
- data/test/tc_bioworks.rb +6 -1
- data/test/tc_bioworks_to_pepxml.rb +25 -18
- data/test/tc_fasta.rb +81 -3
- data/test/tc_fasta_shaker.rb +147 -0
- data/test/tc_gi.rb +20 -0
- data/test/tc_id_class_anal.rb +9 -12
- data/test/tc_id_precision.rb +12 -11
- data/test/{tc_false_positive_rate.rb → tc_precision.rb} +13 -22
- data/test/tc_protein_summary.rb +31 -22
- data/test/tc_roc.rb +95 -50
- data/test/tc_sequest.rb +212 -145
- data/test/tc_spec.rb +10 -5
- data/test/tc_spec_id.rb +0 -2
- data/test/tc_spec_id_xml.rb +36 -0
- data/test/tc_srf.rb +216 -0
- metadata +35 -21
- data/lib/spec_id/false_positive_rate.rb +0 -476
- data/test/tc_gi2annot.rb +0 -12
@@ -0,0 +1,136 @@
|
|
1
|
+
#!/usr/bin/ruby -w
|
2
|
+
|
3
|
+
# Here is what I plotted: take each peptide-protein pair identified on the top-hit scans (the same pair will likely be identified on multiple scans) and plot the top probability of each such pair.
|
4
|
+
# There are 43 such id'd peptides for Sashimi, whereas SEQUEST id's about 66. So you'll have 66 (1-p-values) to plot, I had 43. Similarly for OMICS.
|
5
|
+
|
6
|
+
require 'spec_id'
|
7
|
+
require 'fasta'
|
8
|
+
require 'optparse'
|
9
|
+
require 'ostruct'
|
10
|
+
|
11
|
+
# Returns the accession for a fasta protein entry: the first capture group
# of ACC_REGEX when the header matches it, otherwise the entire header with
# a leading '>' removed and trailing whitespace stripped.
def get_fasta_accession(fasta_prot)
  header = fasta_prot.header
  return Regexp.last_match(1).dup if header =~ ACC_REGEX
  header.sub(/^>/, '').rstrip
end
|
21
|
+
|
22
|
+
# Returns the accession embedded in the peptide's protein reference (first
# capture group of ACC_REGEX), or the whole reference with trailing
# whitespace stripped when the reference does not match.
def accession_from_ref(pep)
  reference = pep.prot.reference
  return Regexp.last_match(1).dup if reference =~ ACC_REGEX
  reference.rstrip
end
|
31
|
+
|
32
|
+
# Returns the accession stored on the peptide's protein. When that value is
# missing (nil/false) or is the placeholder '0' / 0, the accession is taken
# from the protein reference via accession_from_ref instead.
def get_pep_prot_accession(pep)
  accession = pep.prot.accession
  usable = accession && accession != '0' && accession != 0
  usable ? accession : accession_from_ref(pep)
end
|
40
|
+
|
41
|
+
#####################################################################
|
42
|
+
# MAIN
|
43
|
+
#####################################################################
|
44
|
+
|
45
|
+
# Command-line handling. opt.p names the scoring parameter (default 'prob');
# opt.t is set when ties on the best hit are allowed.
opt = OpenStruct.new
opt.p = 'prob'

opts = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} bioworks.xml true_hits.fasta"
  parser.separator " [prints to stdout tab delimited table]"
  parser.on('-t', '--ties', 'allow ties on best hit') { |flag| opt.t = flag }
  parser.on('-p', '--param <s>', 'param: (xcorr | prob)') { |name| opt.p = name }
end
opts.parse!

# Both positional arguments are required; otherwise print usage and stop.
unless ARGV.size >= 2
  puts opts
  exit
end

# For each accepted parameter name: the peptide attribute to read, and which
# end of an ascending sort on that attribute holds the best hit.
param, best = {
  'prob'  => [:peptide_probability, :first],
  'xcorr' => [:xcorr, :last],
}[opt.p]
abort "incorrect param: #{opt.p}" unless param
|
70
|
+
|
71
|
+
############################
# GLOBALS
DELIM = "\t"
ACC_REGEX = /\|(.*?)\|/o
############################

bioworks = ARGV[0]
fasta_file = ARGV[1]

# Accessions of the proteins in the "true hits" fasta file, stored as hash
# keys so the membership test in the output loop is a single lookup rather
# than a scan over every accession.
true_accessions = {}
Fasta.new.read_file(fasta_file).prots.each do |prot|
  true_accessions[get_fasta_accession(prot)] = true
end

peptides = SpecID.new(bioworks).peps

# Numeric score of a peptide under the selected parameter.
score_of = proc { |pep| pep.send(param).to_f }

## Get the best peptide(s) per scan
top_peps_per_scan = []

peptides.hash_by(:base_name, :first_scan).each do |_bn_scan, pep_array|
  ranked = pep_array.sort_by(&score_of)

  # The sort is ascending, so `best` (:first or :last) says which end of the
  # list holds the top hit; it is removed from the list before tie detection.
  top_pep = (best == :first) ? ranked.shift : ranked.pop
  top_score = score_of.call(top_pep)
  tied = ranked.select { |pep| score_of.call(pep) == top_score }

  if tied.empty?
    top_peps_per_scan << top_pep
  elsif opt.t
    # ties allowed: keep the top hit and everything that ties with it
    top_peps_per_scan.push(top_pep, *tied)
  end
  # ties present but not allowed: the scan contributes nothing
end

## Get the best scoring peptide per peptide/prot from list of best
## peptides/scan
by_seq_and_prot = top_peps_per_scan.hash_by { |pep| [pep.sequence, get_pep_prot_accession(pep)] }
top_pep_seq_prots = by_seq_and_prot.map do |_key, pep_array|
  pep_array.sort_by(&score_of).send(best)
end

## sort the peptides by best score
sorted_top_pep_seq_prots = top_pep_seq_prots.sort_by(&score_of)
sorted_top_pep_seq_prots.reverse! if best == :last

## plot the probability vs. the number of tps
puts ['#TPs', param, 'sequence', 'protein accession', 'xcorr'].join(DELIM)
tps = 0
sorted_top_pep_seq_prots.each do |pep|
  accession = get_pep_prot_accession(pep)
  next unless true_accessions.key?(accession)
  tps += 1
  puts [tps.to_s, pep.send(param), pep.sequence, accession, pep.xcorr].join(DELIM)
end
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
|
134
|
+
|
135
|
+
|
136
|
+
|
data/script/top_hit_per_scan.rb
CHANGED
@@ -4,7 +4,6 @@
|
|
4
4
|
cats = %w(base_name sequence xcorr deltacn first_scan last_scan)
|
5
5
|
###################################################################
|
6
6
|
|
7
|
-
require 'pp'
|
8
7
|
require 'spec_id'
|
9
8
|
require 'hash_by'
|
10
9
|
|
@@ -46,7 +45,11 @@ outfile_top = file.sub(/\.xml$/, extension_top)
|
|
46
45
|
outfile_all = file.sub(/\.xml$/, extension_all)
|
47
46
|
|
48
47
|
sp = SpecID.new(file)
|
49
|
-
|
48
|
+
|
49
|
+
# The old (incorrect version)
|
50
|
+
# pep_hash = sp.peps.hash_by(:first_scan, :last_scan)
|
51
|
+
# The correct version:
|
52
|
+
pep_hash = sp.peps.hash_by(:base_name, :first_scan, :last_scan)
|
50
53
|
top_per_scan = pep_hash.map {|k,v| v.sort_by {|ob| ob.xcorr.to_f }.last }
|
51
54
|
top_per_scan = top_per_scan.sort_by {|pep| pep.first_scan.to_i }
|
52
55
|
|
data/test/tc_aa_freqs.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'spec_id/aa_freqs'
|
5
|
+
|
6
|
+
|
7
|
+
# Unit tests for SpecID::AAFreqs, run against tfiles/small.fasta.
# NOTE(review): tc_fasta.rb also declares a class named FastaTest; if both
# files are loaded into one process the two definitions are merged (including
# #initialize) -- confirm whether this class should carry its own name.
class FastaTest < Test::Unit::TestCase

  def initialize(arg)
    super(arg)
    @tfiles = "#{File.dirname(__FILE__)}/tfiles/"
    @sf = "#{@tfiles}small.fasta"
  end

  # Every expected residue must be present with the expected frequency, and
  # all frequencies together must sum to 1.
  def test_basic
    expected_freqs = {
      :I=>0.0628918621937819, :S=>0.0539719475147049, :D=>0.0526145691939758, :Z=>0.0,
      :L=>0.102772929998061, :T=>0.0491888048607071, :E=>0.0609527503070261, :O=>0.0,
      :C=>0.0157714433456144, :K=>0.0471850559110594, :U=>0.0, :Q=>0.0382651412319824,
      :W=>0.0137030573330748, :A=>0.101997285243359, :M=>0.0294745006786892, :J=>0.0,
      :G=>0.0811195139292871, :Y=>0.0254670027793937, :X=>0.0, :F=>0.0418201796910348,
      :R=>0.0546829552065154, :V=>0.0702604873634542, :H=>0.0213302307543145, :B=>0.0,
      :N=>0.03471010277293, :P=>0.0418201796910348
    }
    freqs = SpecID::AAFreqs.new(@sf).aafreqs
    expected_freqs.each do |residue, freq|
      assert(freqs.key?(residue))
      assert_in_delta(freq, freqs[residue], 0.00000001, "freqs match up")
    end
    total = freqs.values.inject(0.0) { |acc, freq| acc + freq }
    assert_in_delta(1.0, total, 0.0000000000001, "all freqs add to 1")
  end

  # Compares probability_of_length_table output against precomputed tables
  # for two probabilities.
  def test_probability_of_length_table
    [
      [0.01, [0.0, 0.01, 0.0199, 0.029701, 0.0394039900000001]],
      [0.2,  [0.0, 0.2, 0.36, 0.488, 0.5904]],
    ].each do |prob, table|
      assert_equal_arrs_in_delta(table, SpecID::AAFreqs.probability_of_length_table(prob, 4), 0.000000001)
    end
  end

  # Uses the first 13 residues of each protein in the fasta file as the
  # peptide set and checks the actual/expected counts reported for :C.
  def test_actual_and_expected_number
    fasta = Fasta.new.read_file(@sf)
    freqs = SpecID::AAFreqs.new
    freqs.aafreqs = freqs.calculate_frequencies(fasta)

    peptide_aaseqs = fasta.prots.map { |prot| prot.aaseq[0..12] }
    assert_equal(50, peptide_aaseqs.size, 'sanity check')

    actual, expected = freqs.actual_and_expected_number(peptide_aaseqs, :C, 1)
    assert_equal(9, actual)
    assert_in_delta( 9.33530631238985, expected, 0.0000000001)
  end

  private

  # Asserts that each element of +expect+ is within +delta+ of the element
  # at the same index in +actual+.
  def assert_equal_arrs_in_delta(expect, actual, delta)
    expect.each_index do |idx|
      assert_in_delta(expect[idx], actual[idx], delta)
    end
  end

end
|
data/test/tc_bioworks.rb
CHANGED
@@ -8,7 +8,8 @@ class BioworksTest < Test::Unit::TestCase
|
|
8
8
|
def initialize(arg)
|
9
9
|
super(arg)
|
10
10
|
@tfiles = File.dirname(__FILE__) + '/tfiles/'
|
11
|
-
@
|
11
|
+
@tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
|
12
|
+
@tf_bioworks_xml = @tfiles_l + "bioworks.xml"
|
12
13
|
@tf_bioworks_xml_small = @tfiles + "bioworks_small.xml"
|
13
14
|
@tf_bioworks_xml_really_small = @tfiles + "bioworks_with_INV_small.xml"
|
14
15
|
@tf_params = @tfiles + "bioworks32.params"
|
@@ -34,9 +35,13 @@ class BioworksTest < Test::Unit::TestCase
|
|
34
35
|
end
|
35
36
|
|
36
37
|
def Xtest_xml_parsing_speed
|
38
|
+
if File.exist? @tfiles_l
|
37
39
|
#puts Benchmark.bm {|b|
|
38
40
|
obj = SpecID::Bioworks.new(@tf_bioworks_xml)
|
39
41
|
#}
|
42
|
+
else
|
43
|
+
assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
|
44
|
+
end
|
40
45
|
end
|
41
46
|
|
42
47
|
def test_xml_parsing_bioworks_single
|
@@ -9,11 +9,10 @@ class BioworksToPepXMLTest < Test::Unit::TestCase
|
|
9
9
|
def initialize(arg)
|
10
10
|
super(arg)
|
11
11
|
@tfiles = File.dirname(__FILE__) + '/tfiles/'
|
12
|
-
@
|
12
|
+
@tfiles_l = File.dirname(__FILE__) + '/tfiles_large/'
|
13
|
+
@tf_mzxml_path = @tfiles_l + "yeast_gly_mzXML"
|
13
14
|
@tf_bioworks_xml = @tfiles + "bioworks_small.xml"
|
14
15
|
@tf_params = @tfiles + "bioworks32.params"
|
15
|
-
@tf_opd1 = @tfiles + "opd1/bioworks.000.oldparams.xml"
|
16
|
-
@tf_opd1_mzxml = @tfiles + "opd1/000.mzXML.timeIndex"
|
17
16
|
@no_delete = false
|
18
17
|
@out_path = @tfiles + 'pepxml/'
|
19
18
|
@cmd = "ruby -I#{File.join(File.dirname(__FILE__), "..", "lib")} -S bioworks_to_pepxml.rb "
|
@@ -33,23 +32,31 @@ class BioworksToPepXMLTest < Test::Unit::TestCase
|
|
33
32
|
end
|
34
33
|
|
35
34
|
def test_basic
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
35
|
+
if File.exist? @tfiles_l
|
36
|
+
cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path} -d /work/special/path"
|
37
|
+
prc = proc {|file|
|
38
|
+
assert(File.exist?(file), "#{file} exists")
|
39
|
+
}
|
40
|
+
_basic(cmd, prc)
|
41
|
+
unless @no_delete then FileUtils.rm_rf(@out_path) end
|
42
|
+
else
|
43
|
+
assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
|
44
|
+
end
|
42
45
|
end
|
43
46
|
|
44
47
|
def test_database
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
48
|
+
if File.exist? @tfiles_l
|
49
|
+
cmd = "#{@cmd} -p #{@tf_params} -o #{@out_path} #{@tf_bioworks_xml} -m #{@tf_mzxml_path}"
|
50
|
+
db_re = /C:\\Xcalibur\\database\\ecoli_K12_ncbi_20060321.fasta/
|
51
|
+
assert_match(db_re, IO.read(@tf_params))
|
52
|
+
prc = proc {|file|
|
53
|
+
assert(File.exist?(file))
|
54
|
+
assert_no_match(db_re, IO.read(file))
|
55
|
+
}
|
56
|
+
_basic(cmd, prc)
|
57
|
+
unless @no_delete then FileUtils.rm_rf(@out_path) end
|
58
|
+
else
|
59
|
+
assert_nil( puts("--SKIPPING TEST-- (missing dir: #{@tfiles_l})") )
|
60
|
+
end
|
54
61
|
end
|
55
62
|
end
|
data/test/tc_fasta.rb
CHANGED
@@ -4,6 +4,8 @@ require File.dirname(File.expand_path(__FILE__)) + '/load_bin_path'
|
|
4
4
|
require 'test/unit'
|
5
5
|
require 'fasta'
|
6
6
|
require 'assert_files'
|
7
|
+
require 'sample_enzyme'
|
8
|
+
require 'set'
|
7
9
|
|
8
10
|
|
9
11
|
module Test::Unit::Assertions
|
@@ -11,6 +13,7 @@ module Test::Unit::Assertions
|
|
11
13
|
end
|
12
14
|
|
13
15
|
class FastaTest < Test::Unit::TestCase
|
16
|
+
NODELETE = false
|
14
17
|
|
15
18
|
def initialize(arg)
|
16
19
|
super(arg)
|
@@ -73,7 +76,7 @@ class FastaTest < Test::Unit::TestCase
|
|
73
76
|
|
74
77
|
def test_mod
|
75
78
|
## Testing shuffle:
|
76
|
-
|
79
|
+
`#{@fasta_mod_cmd + 'shuffle ' + @sf}`
|
77
80
|
assert(File.exist?(@sf_shuffle), "output file #{@sf_shuffle} exists")
|
78
81
|
ob1 = Fasta.new.read_file(@sf)
|
79
82
|
ob2 = Fasta.new.read_file(@sf_shuffle)
|
@@ -83,7 +86,7 @@ class FastaTest < Test::Unit::TestCase
|
|
83
86
|
assert(_are_shuffled?(ob1,ob2))
|
84
87
|
|
85
88
|
## Testing invert:
|
86
|
-
|
89
|
+
`#{@fasta_mod_cmd + 'invert ' + @sf}`
|
87
90
|
assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
|
88
91
|
ob1 = Fasta.new.read_file(@sf)
|
89
92
|
ob2 = Fasta.new.read_file(@sf_invert)
|
@@ -94,7 +97,7 @@ class FastaTest < Test::Unit::TestCase
|
|
94
97
|
|
95
98
|
## Testing prefix
|
96
99
|
#puts "#{@fasta_mod_cmd + '-p _HELLO_ invert ' + @sf}"
|
97
|
-
|
100
|
+
`#{@fasta_mod_cmd + 'invert -p _HELLO_ ' + @sf}` # NOT WORKING!
|
98
101
|
assert(File.exist?(@sf_invert), "output file #{@sf_invert} exists")
|
99
102
|
ob1 = Fasta.new.read_file(@sf)
|
100
103
|
ob2 = Fasta.new.read_file(@sf_invert)
|
@@ -176,6 +179,81 @@ class FastaTest < Test::Unit::TestCase
|
|
176
179
|
end
|
177
180
|
end
|
178
181
|
|
182
|
+
# Checks Fasta::Prot#invert_tryptic_peptides! on individual sequences; each
# case is [input sequence, expected aaseq afterwards, assertion message].
def test_invert_tryptic_peptides
  # FOR INDIVIDUAL PROTEINS:
  # tryptic = ABCK, CDER, DEK, DGEK, WXYR, R, K, DER
  first_seq = 'ABCKCDERDEKDGEKWXYRRKDER'
  SampleEnzyme.tryptic(first_seq) # call kept from the original; its result is not asserted on

  cases = [
    [first_seq, %w(CBAK EDCR EDK EGDK YXWR R K EDR).join(''), "reversing tryptic peptides"],
    ['XYRABCD', 'YXRDCBA', 'last peptide treated special'],
    ['XYRPABCD', 'DCBAPRYX', 'with a proline'],
  ]
  cases.each do |seq, expected, msg|
    prot = Fasta::Prot.new(nil, seq)
    prot.invert_tryptic_peptides!
    assert_equal(expected, prot.aaseq, msg)
  end
end
|
203
|
+
|
204
|
+
# Exercises Fasta#fraction_of_prots with fractions equal to, below and above
# 1.0, with and without a header-prefix proc.
def test_fraction_of_prots
  entries = [['>silly1', "PEPTIDE"], ['>silly2', "ANOTHER"], ['>silly3', "AGAIN"], ['>silly4', "LARMA"]]
  f = Fasta.new(entries.map { |header, seq| Fasta::Prot.new(header, seq) })
  headers_of = proc { |fasta| fasta.prots.map { |prt| prt.header }.to_set }
  aaseqs_of = proc { |fasta| fasta.prots.map { |prt| prt.aaseq }.to_set }

  # simple:
  whole = f.fraction_of_prots(1.0)
  assert_equal(headers_of.call(f), headers_of.call(whole), "same headers")
  assert_equal(aaseqs_of.call(f), aaseqs_of.call(whole), "same aaseqs")

  # the prefix proc receives a counter and returns the header prefix
  pre = proc { |cnt| "SHUFF_f#{cnt}_" }

  # test prefix
  prefixed = f.fraction_of_prots(1.0, pre)
  prefixed.prots.each do |prot|
    assert_match(/^>SHUFF_f0_/, prot.header, "contains new prefix")
  end

  # smaller
  smaller = f.fraction_of_prots(0.75, pre)
  assert_equal(3, smaller.prots.size, "correct number of proteins")

  # bigger
  bigger = f.fraction_of_prots(2.5, pre)
  assert_equal(10, bigger.prots.size, "correct number of proteins")
  [[0..3, /^>SHUFF_f0_/], [4..7, /^>SHUFF_f1_/], [8..9, /^>SHUFF_f2_/]].each do |range, prefix_re|
    bigger.prots[range].each { |prt| assert_match(prefix_re, prt.header ) }
  end

  # crazy
  crazy = f.fraction_of_prots(1.33, pre)
  assert_equal(6, crazy.prots.size, "correct number of proteins")
end
|
235
|
+
|
236
|
+
# Inverts the tryptic peptides of every protein read from @sf, writes the
# result to a scratch file and checks the first and last sequence lines.
# The scratch file is removed in an ensure clause so a failing assertion does
# not leave it behind (unless NODELETE is set).
def test_inverted_tryptic_peptides_for_file
  # for a file:
  tmpfile = @tfiles + "fasta.tmp"
  fasta = Fasta.new.read_file(@sf)
  fasta.aaseq_invert_tryptic_peptides!
  fasta.write_file(tmpfile)
  lines = IO.readlines(tmpfile)

  #normal = 'MKRISTTITTTITITTGNGAG'
  inverted_tryptic = 'MKRGAGNGTTITITTTITTSI' ## ?????
  assert_equal(inverted_tryptic, lines[1].chomp)

  #normal = 'MATYLIGDVHGCYDELIALLHKVEFTPGKDTLWLTGDLVARGPGSLDVLRYVKSLGDSVRLVLGNHDLHL
  # LAVFAGISRNKPKDRLTPLLEAPDADELLNWLRRQPLLQIDEEKKLVMAHAGITPQWDLQTAKECARDVE
  # AVLSSDSYPFFLDAMYGDMPNNWSPELRGLGRLRFITNAFTRMRFCFPNGQLDMYSKESPEEAPAPLKPW
  # FAIPGPVAEEYSIAFGHWASLEGKGTPEGIYALDTGCCWGGTLTCLRWEDKQYFVQPSNRHKDLGEAAAS'
  inverted_tryptic = 'HLLAILEDYCGHVDGILYTAMKGPTFEVKAVLDGTLWLTDRLVDLSGPGRVYKVSDGLSRSIGAFVALLHLDHNGLVLRPKNKDRLWNLLEDADPAELLPTLRREEDIQLLPQKKATQLDWQPTIGAHAMVLKACERLEPSWNNPMDGYMADLFFPYSDSSLVAEVDRGLGRLRTFANTIFRMRSYMDLQGNPFCFKGELSAWHGFAISYEEAVPGPIAFWPKLPAPAEEPSEKLCTLTGGWCCGTDLAYIGEPTGRDEWKNSPQVFYQRHKSAAAEGLD'
  assert_equal(inverted_tryptic, lines[-1].chomp)
ensure
  # tmpfile may not exist yet if reading or writing failed above
  File.unlink(tmpfile) if !NODELETE && tmpfile && File.exist?(tmpfile)
end
|
254
|
+
|
255
|
+
|
256
|
+
|
179
257
|
## HELPER ASSERTIONS:
|
180
258
|
|
181
259
|
def _are_inverted?(obj1, obj2)
|
@@ -0,0 +1,147 @@
|
|
1
|
+
|
2
|
+
require 'test/unit'
|
3
|
+
require 'fasta'
|
4
|
+
|
5
|
+
# Input fasta written to the scratch file in TestBasic#setup.
Filestring = <<END_FASTA
>gi|P1
AMKRGAN
>gi|P2
CRGATKKTAGRPMEK
>gi|P3
PEPTIDE
END_FASTA

# Expected output of `reverse` on Filestring.
Rev = <<END_FASTA
>gi|P1
NAGRKMA
>gi|P2
KEMPRGATKKTAGRC
>gi|P3
EDITPEP
END_FASTA

# Expected output of `reverse --tryptic_peptides` on Filestring.
RevTryptic = <<END_FASTA
>gi|P1
MAKRNAG
>gi|P2
CRTAGKKEMPRGATK
>gi|P3
EDITPEP
END_FASTA

# Same text as RevTryptic.
ShuffTryptic = <<END_FASTA
>gi|P1
MAKRNAG
>gi|P2
CRTAGKKEMPRGATK
>gi|P3
EDITPEP
END_FASTA
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
# Command-line tests for bin/fasta_shaker.rb. Each test runs the script on
# the small fasta file written in #setup and inspects the output file @f.
class TestBasic < Test::Unit::TestCase

  # Builds the command prefix and writes Filestring to the scratch input.
  def setup
    here = File.dirname(__FILE__)
    @cmd = "ruby -I #{here}/../lib #{here}/../bin/fasta_shaker.rb "
    @tfiles = "#{here}/tfiles/"
    @tmpfile = "#{@tfiles}littlefasta.trash.fasta"
    @f = "#{@tfiles}trash.fasta"
    File.open(@tmpfile, "w") { |out| out.print Filestring }
  end

  # Removes the scratch input and output files when they exist.
  def teardown
    [@tmpfile, @f].each do |path|
      File.unlink(path) if File.exist?(path)
    end
  end

  def test_reverse
    system(@cmd + "reverse #{@tmpfile} -o #{@f}")
    assert_equal(Rev, fastap(@f).to_s)
  end

  def test_reverse_tryptic
    system(@cmd + "reverse #{@tmpfile} -o #{@f} --tryptic_peptides")
    assert_equal(RevTryptic, fastap(@f).to_s)
  end

  def test_shuff_tryptic
    system(@cmd + "shuffle #{@tmpfile} -o #{@f} --tryptic_peptides")
    out_lines = fastap(@f).to_s.split("\n")
    assert_equal('KR', out_lines[1][2..3])
    assert_equal('R', out_lines[3][1..1])
    assert_equal('CRGATKKTAGRPMEK'.size, out_lines[3].size, "sequence is same size")
    assert_not_equal('CRGATKKTAGRPMEK', out_lines[3], "sequence is randomised from original [remote chance of failure] rerun to make sure")
  end

  def test_shuffle
    system(@cmd + "shuffle #{@tmpfile} -o #{@f}")
    original_lines = strlns(Filestring)
    out_lines = fastalns(@f)
    out_lines.each_with_index do |line, idx|
      assert_equal(original_lines[idx].size, line.size, "same size lines: A: <<#{original_lines[idx]}>> B: <<#{line}>>")
    end
    assert_equal('CRGATKKTAGRPMEK'.size, out_lines[3].size, "sequence is same size")
    assert_not_equal('CRGATKKTAGRPMEK', out_lines[3], "sequence is randomised from original [remote chance of failure] rerun to make sure")
  end

  # With -c the output is expected to hold the original entries followed by
  # the reversed ones.
  def test_cat
    `#{@cmd + "reverse #{@tmpfile} -c -o #{@f}"}` ## suppress warning
    out_lines = fastalns(@f)
    assert_equal(strlns(Filestring), out_lines[0..5], "first part equal")
    assert_equal(strlns(Rev), out_lines[6..-1], "second part equal")
  end

  def test_fraction
    `#{@cmd + "reverse #{@tmpfile} -f 2.6 -o #{@f}"}`
    assert_equal(8, fastap(@f).size)

    `#{@cmd + "shuffle #{@tmpfile} -f 2.0 -c -p MINE_ -o #{@f}"}`
    assert_equal(9, fastap(@f).size)
    prots = fastap(@f)
    [[0..2, /^>/], [3..5, /^>MINE_f0_/], [6..8, /^>MINE_f1_/]].each do |range, header_re|
      prots[range].each do |prt|
        assert_match(header_re, prt.header, "prefix matches")
      end
    end
    #cmd = @cmd + "reverse #{@tmpfile} -c -f 2.0 -o #{@f}"
  end

  def test_prefix
    `#{@cmd + "reverse #{@tmpfile} -p SILLY_ -o #{@f}"}`
    fastap(@f).each do |prt|
      assert_match(/^>SILLY_.+/, prt.header)
    end
  end


  private

  # Splits a string into its lines (newlines removed).
  def strlns(str)
    str.split("\n")
  end

  # Asserts that file +fn+ exists and returns its lines.
  def fastalns(fn)
    assert(File.exist?(fn), "FILE: #{fn} exists")
    IO.read(fn).split("\n")
  end

  # returns the fasta object proteins (after asserting that +fn+ exists)
  def fastap(fn)
    assert(File.exist?(fn), "FILE: #{fn} exists")
    Fasta.new.read_file(fn).prots
  end

end
|
data/test/tc_gi.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
|
2
|
+
require 'test/unit'
|
3
|
+
require 'gi'
|
4
|
+
|
5
|
+
|
6
|
+
# Tests GI.gi2annot against a single GI number.
class Gi2AnnotTest < Test::Unit::TestCase
  ROOT_DIR = File.join(File.dirname(__FILE__), '..')

  # Looks up one GI number and compares the returned annotation. If the
  # lookup raises, the test reports that it is being skipped (the original
  # message attributes this to a missing internet connection) instead of
  # erroring; the annotation is only checked when the lookup succeeded.
  # NOTE(review): StandardError is broad and will also hide non-network
  # failures inside GI.gi2annot -- narrow it once the exception classes that
  # GI actually raises are confirmed.
  def test_single_query
    begin
      annot = GI.gi2annot([16130548]).first
    rescue StandardError => e
      puts "SKIPPING gi2annot test since no internet connection available:"
      puts "#{e}"
      assert true
    else
      assert_equal('CP4-57 prophage; RNase LS [Escherichia coli K12]'+"\n", annot)
    end
  end
end
|
data/test/tc_id_class_anal.rb
CHANGED
@@ -23,14 +23,12 @@ class IDClassAnalTest < Test::Unit::TestCase
|
|
23
23
|
output = `#{@cmd} -p INV_ #{@tf_proph_inv}`
|
24
24
|
fps = [1.00, 1.00, 0.97]
|
25
25
|
tps = [1.00, 1.00, 0.98, 0.97, 0.97, 0.97, 0.97]
|
26
|
-
puts output
|
27
26
|
#File.open("tmp.csv","w") do |fh| fh.print output end
|
28
27
|
assert 1
|
29
28
|
end
|
30
29
|
|
31
|
-
def
|
30
|
+
def test_basic
|
32
31
|
output = `#{@cmd} -p INV_ #{@tf_bioworks_esmall_xml}`
|
33
|
-
# @TODO: that's the output, need to grab for consistency sake
|
34
32
|
exp = [
|
35
33
|
[1, 1.0, 0.0],
|
36
34
|
[2, 1.0, 0.0],
|
@@ -40,11 +38,11 @@ class IDClassAnalTest < Test::Unit::TestCase
|
|
40
38
|
[6, 1.0, 0.0],
|
41
39
|
[9, 1.0, 0.0],
|
42
40
|
[10, 1.0, 0.0],
|
43
|
-
[11, 0.
|
44
|
-
[12, 0.
|
45
|
-
[13, 0.
|
46
|
-
[14, 0.
|
47
|
-
[15, 0.
|
41
|
+
[11, 0.909090909090909],
|
42
|
+
[12, 0.916666666666667],
|
43
|
+
[13, 0.923076923076923],
|
44
|
+
[14, 0.928571428571429],
|
45
|
+
[15, 0.866666666666667],
|
48
46
|
]
|
49
47
|
outarr = output.split($/)
|
50
48
|
exp.each_with_index do |line,i|
|
@@ -55,18 +53,17 @@ class IDClassAnalTest < Test::Unit::TestCase
|
|
55
53
|
end
|
56
54
|
end
|
57
55
|
|
58
|
-
def
|
56
|
+
def test_multiple_output
|
59
57
|
myplot = 'class_anal.toplot'
|
60
58
|
output = `#{@cmd} -j -p INV_,SHUFF_ #{@tf_bioworks_esmall_xml} #{@tf_bioworks_shuff}`
|
61
59
|
assert(output.size > 10) ## @TODO: BETTER HERE
|
62
60
|
assert(File.exist?(myplot), "file #{myplot} exists")
|
63
61
|
File.unlink myplot
|
64
|
-
|
65
62
|
end
|
66
63
|
|
67
|
-
def
|
64
|
+
def test_jtplot_output
|
68
65
|
myplot = 'class_anal.toplot'
|
69
|
-
`#{@cmd} -p INV_ -j #{@tf_bioworks_esmall_xml}`
|
66
|
+
output = `#{@cmd} -p INV_ -j #{@tf_bioworks_esmall_xml}`
|
70
67
|
assert(File.exist?(myplot), "file #{myplot} exists")
|
71
68
|
File.unlink myplot
|
72
69
|
end
|