ms-error_rate 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/.autotest +14 -0
- data/.gitmodules +9 -0
- data/History +16 -0
- data/LICENSE +2 -0
- data/Rakefile +52 -0
- data/VERSION +1 -1
- data/lib/ms/error_rate/decoy.rb +27 -0
- data/lib/ms/error_rate/qvalue/mascot/percolator.rb +93 -0
- data/lib/ms/error_rate/qvalue/mascot.rb +68 -0
- data/lib/ms/error_rate/qvalue/pepxml.rb +52 -0
- data/lib/ms/error_rate/qvalue.rb +93 -0
- data/lib/ms/error_rate/sbv/peptide_based.rb +30 -0
- data/lib/ms/error_rate/sbv/protein_based.rb +39 -0
- data/lib/ms/error_rate/sbv.rb +111 -0
- data/lib/ms/error_rate.rb +9 -0
- data/lib/ms/ident.rb +125 -0
- data/lib/support/sort_by_attributes.rb +51 -0
- data/lib/transmembrane/phobius.rb +136 -0
- data/lib/transmembrane/toppred.rb +368 -0
- data/lib/transmembrane.rb +157 -0
- data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
- data/script/expert_addition.rb +26 -0
- data/script/expert_list.rb +53 -0
- data/script/fasta_ipi_to_ipi_decoy.rb +23 -0
- data/script/minimal_protein_set.rb +366 -0
- data/script/unique_seq_stats.rb +72 -0
- metadata +66 -14
data/.autotest
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
# First :initialize hook: clear whatever mappings are registered on the
# autotest instance before we add our own.
Autotest.add_hook(:initialize) { |autotest| autotest.clear_mappings }

# Second :initialize hook: any change to lib/<path>.rb maps to both
# spec/<path>_spec.rb and test/<path>_test.rb.
Autotest.add_hook(:initialize) do |autotest|
  autotest.add_mapping(%r%^lib/(.*)\.rb$%) do |_filename, match|
    stem = match[1]
    ["spec/#{stem}_spec.rb", "test/#{stem}_test.rb"]
  end
end
|
data/.gitmodules
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
[submodule "submodule/ms-testdata"]
|
2
|
+
path = submodule/ms-testdata
|
3
|
+
url = git://github.com/bahuvrihi/ms-testdata.git
|
4
|
+
[submodule "submodule/ms-in_silico"]
|
5
|
+
path = submodule/ms-in_silico
|
6
|
+
url = git://github.com/bahuvrihi/ms-in_silico.git
|
7
|
+
[submodule "submodule/tap-mechanize"]
|
8
|
+
path = submodule/tap-mechanize
|
9
|
+
url = git://github.com/bahuvrihi/tap-mechanize.git
|
data/History
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
== 0.0.6
|
2
|
+
|
3
|
+
* changed peptide centric db output to full YAML (i.e., the protein IDs are in an inline array)
|
4
|
+
|
5
|
+
== 0.0.3
|
6
|
+
|
7
|
+
* switching to ms-template-ish structure
|
8
|
+
|
9
|
+
== 0.0.2 / 2009-10-14
|
10
|
+
|
11
|
+
* basic validation with peptide and protein centric sample bias validation.
|
12
|
+
* peptide centric database created that includes methionine cleavage.
|
13
|
+
|
14
|
+
== 0.0.1 / 2009-08-25
|
15
|
+
|
16
|
+
* initial work - borrowing basic structure from ms-sequest and using original mspire lib/validators work.
|
data/LICENSE
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
Copyright shared among contributing institutions:
|
2
2
|
Copyright (c) 2006-2008 University of Texas at Austin (the initial project)
|
3
3
|
Copyright (c) 2009 Regents of the University of Colorado and Howard Hughes Medical Institute. (modularization of the project)
|
4
|
+
Copyright (c) 2011 Brigham Young University (additions)
|
5
|
+
Authored by John T. Prince
|
4
6
|
|
5
7
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
8
|
of this software and associated documentation files (the "Software"), to deal
|
data/Rakefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
require 'rubygems'
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "ms-error_rate"
  gem.homepage = "http://github.com/jtprince/ms-error_rate"
  gem.license = "MIT"
  gem.summary = %Q{An mspire library for calculating error rates in MS/MS identifications (FDRs).}
  gem.description = %Q{aids for creating and calculating error rates using target-decoy searches and sample validation.}
  gem.email = "jtprince@gmail.com"
  gem.authors = ["John Prince"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
  gem.rubyforge_project = 'mspire'
  gem.add_runtime_dependency("ms-core", ">= 0.0.2")
  gem.add_runtime_dependency("ms-ident", ">= 0.0.20")
  gem.add_development_dependency "spec-more", ">= 0"
  gem.add_development_dependency "jeweler", "~> 1.5.2"
  gem.add_development_dependency "rcov", ">= 0"
end
Jeweler::RubygemsDotOrgTasks.new

# `rake spec` (also the default task) runs every spec/**/*_spec.rb file with
# lib/ and spec/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.verbose = true
end

#require 'rcov/rcovtask'
#Rcov::RcovTask.new do |spec|
#  spec.libs << 'spec'
#  spec.pattern = 'spec/**/*_spec.rb'
#  spec.verbose = true
#end

task :default => :spec

# rake/rdoctask was deprecated and later removed from rake; rdoc ships the
# replacement as rdoc/task.  Prefer the new location and fall back to the old
# one so the Rakefile keeps working with whichever is installed.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip the trailing newline of the VERSION file so it does not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "ms-error_rate #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.9
|
1
|
+
0.0.10
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
module Ms
  module ErrorRate
    module Decoy
      # Estimated precision (fraction of true positives among the target
      # hits), where the number of false target hits is estimated as
      # num_decoy * frit.
      #
      # Counts are converted to floats, so fractional amounts are accepted.
      # With zero target hits the result is 0.0 when there is at least one
      # decoy hit and 1.0 otherwise.
      def self.precision(num_target, num_decoy, frit=1.0)
        targets = num_target.to_f
        decoys = num_decoy.to_f
        estimated_true = targets - (decoys * frit)
        return estimated_true / targets unless targets == 0.0
        decoys > 0.0 ? 0.0 : 1.0
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
|
2
|
+
require 'ms/mascot/dat'
|
3
|
+
require 'ms/error_rate/qvalue'
|
4
|
+
require 'ms/error_rate/qvalue/mascot'
|
5
|
+
|
6
|
+
module Ms
  module ErrorRate
    module Qvalue
      module Mascot
        module Percolator

          module_function

          # Reads a tab-delimited percolator output file and returns an array
          # of Structs, one per data line.
          #
          # The Struct members are the column names of the first line (with
          # '-' substituted by '_', e.g. "q-value" => :q_value) followed by
          # three derived members:
          #
          #   :query_num  integer after the ':' in the first part of the PSMId
          #   :rank       integer after the ':' in the second part of the PSMId
          #   :sequence   the peptide without its flanking residues and dots
          #
          # The expected column order is:
          #   PSMId score q-value posterior_error_prob peptide proteinIds
          # score, q-value and posterior_error_prob are cast to floats; all
          # columns after the peptide are collected into an array of protein
          # ids.
          #
          # Returns an empty array for an empty file; blank lines are skipped.
          def tab_txt(file)
            hits = []
            File.open(file) do |io|
              header = io.gets
              return hits unless header # empty file: nothing to parse

              atts = header.chomp.split("\t").map {|v| v.gsub('-', '_').to_sym }
              atts.push(:query_num, :rank, :sequence)
              # Anonymous Struct: Struct.new("Hit", ...) would define the
              # global constant Struct::Hit and redefine it (with a warning)
              # on every call.
              struct_class = Struct.new(*atts)

              io.each do |line|
                next if line.strip.empty?
                (query_rank, score, qvalue, perrp, peptide, *prots) = line.chomp.split("\t")
                # e.g. "query:3;rank:1" => [3, 1]
                (query, rank) = query_rank.split(';').map {|v| v.split(':').last.to_i }

                hits << struct_class.new(query_rank, score.to_f, qvalue.to_f, perrp.to_f, peptide, prots, query, rank, peptide.split('.')[1])
              end
            end
            hits
          end

        end
      end
    end
  end
end
|
42
|
+
|
43
|
+
module Ms::ErrorRate::Qvalue::Mascot::Percolator

  module_function

  # Pairs each Mascot dat(p) file with the percolator tab-txt file at the same
  # index and returns an array of MascotPeptideHit Structs
  # (:filename, :query_title, :charge, :sequence, :mowse, :qvalue), one per
  # query title.
  #
  # Only hits whose [query_num, hit_num] has a q-value in the tab-txt file are
  # considered; when several hits share a query title the one sorting last by
  # mowse is kept.
  #
  # opts =
  #     :min_peptide_length => Integer   (drop best hits with shorter sequences)
  def qvalues(datp_files, tab_txt_files, opts={})
    min_pep_len = opts[:min_peptide_length]

    # collect candidate hits per query title (which should ensure that we end
    # up with the top hit per scan)
    hits_by_query_title = Hash.new {|hash, title| hash[title] = [] }
    datp_files.zip(tab_txt_files) do |datp_file, tab_txt_file|
      # q-value lookup keyed on [query number, rank]
      qvalue_by_query_rank = {}
      Ms::ErrorRate::Qvalue::Mascot::Percolator.tab_txt( tab_txt_file ).each do |row|
        qvalue_by_query_rank[[row.query_num, row.rank]] = row.q_value
      end

      base_no_ext = File.basename(datp_file, '.*')
      Ms::Mascot::Dat.open(datp_file) do |dat|
        dat.each_peptide_hit(:by => :groups, :yield_nil => false, :with_query => true) do |hits,query|
          hits.each do |hit|
            qval = qvalue_by_query_rank[[hit.query_num, hit.hit_num]]
            next unless qval
            record = Ms::ErrorRate::Qvalue::Mascot::MascotPeptideHit.new(base_no_ext, query.title, query.charge, hit.sequence, hit.score, qval)
            hits_by_query_title[record.query_title] << record
          end
        end
      end
    end

    final_hits = []
    hits_by_query_title.each do |title, hits|
      best_hit = hits.size == 1 ? hits.first : hits.sort_by(&:mowse).last
      # the best hit is only kept if it passes the (optional) length filter
      next if min_pep_len && best_hit.sequence.size < min_pep_len
      final_hits << best_hit
    end
    final_hits
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'ms/error_rate/qvalue'
|
2
|
+
require 'ms/mascot/dat'
|
3
|
+
|
4
|
+
module Ms
  module ErrorRate
    module Qvalue
      module Mascot
      end
    end
  end
end


module Ms::ErrorRate::Qvalue::Mascot
  MEMBERS = [:filename, :query_title, :charge, :sequence, :mowse, :qvalue]

  # A peptide hit; serializes to YAML as a plain array of its values rather
  # than as a Struct object.
  MascotPeptideHit = Struct.new(*MEMBERS) do
    def to_yaml(*args)
      to_a.to_yaml(*args)
    end
  end

  module_function

  # Computes target-decoy q-values from Mascot dat files and returns an array
  # of MascotPeptideHit Structs
  # (:filename, :query_title, :charge, :sequence, :mowse, :qvalue) with
  # :qvalue filled in.
  #
  # For each of the two file lists the top hit of every query is read, and
  # when several hits share a query title only the one sorting last by mowse
  # is kept (which should ensure one hit per scan).
  #
  # opts =
  #     :min_peptide_length => Integer   (drop best hits with shorter sequences)
  # opts are also handed on to Ms::ErrorRate::Qvalue.target_decoy_qvalues
  def qvalues(target_files, decoy_files, opts={})
    min_pep_len = opts[:min_peptide_length]

    (target_hits, decoy_hits) = [target_files, decoy_files].map do |files|
      hits_by_query_title = Hash.new {|hash, title| hash[title] = [] }
      files.each do |file|
        base_no_ext = File.basename(file, '.*')
        Ms::Mascot::Dat.open(file) do |dat|
          dat.each_peptide_hit(:by => :top, :yield_nil => false, :with_query => true) do |hit,query|
            record = MascotPeptideHit.new(base_no_ext, query.title, query.charge, hit.sequence, hit.score)
            hits_by_query_title[record.query_title] << record
          end
        end
      end

      best_hits = hits_by_query_title.values.map do |hits|
        hits.size == 1 ? hits.first : hits.sort_by(&:mowse).last
      end
      # the best hit is only kept if it passes the (optional) length filter
      if min_pep_len
        best_hits.select {|best_hit| best_hit.sequence.size >= min_pep_len }
      else
        best_hits
      end
    end

    pairs = Ms::ErrorRate::Qvalue.target_decoy_qvalues(target_hits, decoy_hits, opts, &:mowse)
    pairs.map do |hit, qval|
      hit.qvalue = qval
      hit
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'ms/error_rate/qvalue'
|
2
|
+
|
3
|
+
module Ms ; end
module Ms::ErrorRate ; end
module Ms::ErrorRate::Qvalue ; end

module Ms::ErrorRate::Qvalue::Pepxml
  module_function

  # Reads every search_hit of a target and a decoy pepxml file and returns an
  # array of [hit, qvalue] pairs as produced by
  # Ms::ErrorRate::Qvalue.target_decoy_qvalues.
  #
  # Each hit is a Struct whose members are :aaseq, :charge and one member per
  # search_score name (the names are taken from the first search_hit
  # encountered).  charge is an integer and all search scores are cast as
  # floats.  The caller must provide a sort_by block where the best hits sort
  # last.  Only opt[:z_together] is passed on.
  def target_decoy_qvalues(target_pepxml, decoy_pepxml, opt={}, &sort_by)
    score_names = nil # filled in from the first search_hit seen

    (target_rows, decoy_rows) = [target_pepxml, decoy_pepxml].map do |file|
      File.open(file) do |io|
        doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS)
        # we can work with namespaces, or just remove them ...
        doc.remove_namespaces!
        doc.root.xpath('//search_hit').map do |search_hit|
          peptide = search_hit['peptide']
          charge = search_hit.parent.parent['assumed_charge'].to_i
          score_nodes = search_hit.children.select {|node| node.name == 'search_score' }
          score_names ||= score_nodes.map {|node| node['name'].to_sym }
          [peptide, charge, *score_nodes.map {|node| node['value'].to_f }]
        end
      end
    end

    fields = [:aaseq, :charge] + (score_names || [])
    peptide_hit_class = Struct.new(*fields)
    t_hits = target_rows.map {|row| peptide_hit_class.new(*row) }
    d_hits = decoy_rows.map {|row| peptide_hit_class.new(*row) }

    # hit and qvalue pairs
    Ms::ErrorRate::Qvalue.target_decoy_qvalues(t_hits, d_hits, :z_together => opt[:z_together], &sort_by)
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
|
2
|
+
require 'set'
|
3
|
+
require 'ms/error_rate/decoy'
|
4
|
+
|
5
|
+
|
6
|
+
class Array
  # Backport for rubies whose Array lacks group_by: returns a hash mapping
  # each block result to the array of elements that produced it.  Skipped
  # entirely when Array already responds to group_by.
  unless [].respond_to?(:group_by)
    def group_by(&block)
      groups = Hash.new {|hash, key| hash[key] = [] }
      each {|item| groups[block.call(item)] << item }
      groups
    end
  end
end
|
15
|
+
|
16
|
+
module Ms

  module ErrorRate
    # For generating and working with q-value calculations.  The q-value is
    # the global false discovery rate when accepting that particular ID.  We
    # do not necessarily distinguish here between *how* the FDR is generated
    # (i.e., Storey's pFDR "the occurrence of false positives" vs.
    # Benjamini-Hochberg's FDR "the rate of false positives" [except to prefer
    # Storey when possible]).  The main point is that we sort and threshold
    # based on a global FDR.
    module Qvalue
      module_function

      # Returns an array of [target_hit, qvalue] pairs.
      #
      # target_hits / decoy_hits:: arrays of hit objects
      # sorting:: block used with sort_by; it must sort from worst to best.
      #           Defaults to sorting on hit.score.
      # opts::
      #   :z_together => false (default): q-values are computed over all hits
      #   at once and the pairs come back sorted from best to worst.
      #   :z_together => true: hits are grouped on hit.charge, q-values are
      #   computed within each charge group and the groups are concatenated.
      #   NOTE(review): the option name (and the previous comment) suggest the
      #   opposite meaning; the behaviour is documented as implemented and
      #   left unchanged -- confirm intent with callers.
      #
      #   All options are also passed through to mixed_target_decoy.
      def target_decoy_qvalues(target_hits, decoy_hits, opts={}, &sorting)
        sorting ||= :score
        opts = {:z_together => false}.merge(opts)

        # Membership must be decided by object identity: hits are often
        # Structs, which compare by value, so a decoy hit carrying the same
        # values as a target hit would otherwise be counted as a target.
        target_set = {}.compare_by_identity
        target_hits.each {|hit| target_set[hit] = true }

        # Proc.new doesn't do arity checking
        hit_with_qvalue_pairs = Proc.new do |hits|
          sorted_best_to_worst = hits.sort_by(&sorting).reverse
          (targets, qvalues) = Ms::ErrorRate::Qvalue.mixed_target_decoy(sorted_best_to_worst, target_set, opts)
          targets.zip(qvalues)
        end

        all_together = target_hits + decoy_hits
        if !opts[:z_together]
          hit_with_qvalue_pairs.call(all_together)
        else
          all_pairs = []
          all_together.group_by(&:charge).each do |charge, hits|
            all_pairs.concat(hit_with_qvalue_pairs.call(hits))
          end
          all_pairs
        end
      end

      # Returns [target_hits, qvalues] (parallel arrays sorted from best hit
      # to worst hit).
      #
      # best_to_worst:: array-like object of hits sorted from best to worst
      #                 with the decoys interspersed
      # target_setlike:: object responding to include?(hit); a hit is assumed
      #                  to be a decoy if it is not found in it!
      # opts::
      #   :monotonic => true (default) guarantees that the q-values never
      #   decrease from the best to the worst hit (each q-value becomes the
      #   minimum of itself and every q-value of a worse hit).  If false, the
      #   raw 1 - precision values are returned.
      def mixed_target_decoy(best_to_worst, target_setlike, opts={})
        opts = {:monotonic => true}.merge(opts)
        num_target = 0
        num_decoy = 0
        target_hits = []
        qvalues = []
        best_to_worst.each do |hit|
          if target_setlike.include?(hit)
            num_target += 1
            precision = Ms::ErrorRate::Decoy.precision(num_target, num_decoy)
            target_hits << hit
            qvalues << (1.0 - precision)
          else
            num_decoy += 1
          end
        end
        if opts[:monotonic]
          # walk from the worst to the best score carrying the running minimum
          min_qvalue = qvalues.last
          qvalues = qvalues.reverse.map do |val|
            if min_qvalue < val
              min_qvalue
            else
              min_qvalue = val
              val
            end
          end.reverse
        end
        [target_hits, qvalues]
      end

    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'ms/error_rate/sbv'
|
2
|
+
|
3
|
+
module Ms
  module ErrorRate
    class Sbv
      # Constraints on the aaseq attribute of peptides (the bare amino acid
      # sequence).
      class PeptideBased

        # Delegates to Ms::ErrorRate::Sbv.generate_hashes with the type code
        # "aa_min<min_num>" and a block scoring each peptide 1 or 0:
        # with min_num == 1 the peptide scores 1 if it includes aa, otherwise
        # it scores 1 if at least min_num of its characters equal aa.
        def self.generate_hashes(pep_to_prot_file, aa="C", min_num=1 )
          Ms::ErrorRate::Sbv.generate_hashes(pep_to_prot_file, :type_code => "aa_min#{min_num}") do |pep|
            qualifies =
              if min_num == 1
                pep.include?(aa)
              else
                pep.each_char.count {|char| char == aa } >= min_num
              end
            qualifies ? 1 : 0
          end
        end

      end # class
    end # Sbv
  end # ER
end # Ms
|
30
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'ms/fasta'
|
2
|
+
require 'ms/error_rate/sbv'
|
3
|
+
require 'transmembrane'
|
4
|
+
|
5
|
+
module Ms
  module ErrorRate
    class Sbv
      module ProteinBased
        # value reported for a peptide when none of its proteins has a known value
        DEFAULT_NO_PROTS_VAL = 0.0

        # Delegates to Ms::ErrorRate::Sbv.generate_hashes, scoring each
        # peptide with the mean of the non-nil values that protid_to_val holds
        # for the peptide's proteins (DEFAULT_NO_PROTS_VAL when there are
        # none).
        #
        # note the pep to prot hash has proteins in a string separated by a
        # hyphen.
        #
        # options are passed on to Sbv.generate_hashes; :type_code defaults to
        # 'tm' and :protein_hash is always set to protid_to_val.  The caller's
        # options hash is not modified.
        #
        # returns the names of the files written
        def self.generate_hashes(pep_to_prot_file, protid_to_val, options={})
          # work on a copy so the caller's hash is left untouched
          opts = options.merge(:protein_hash => protid_to_val)
          opts[:type_code] ||= 'tm'

          Ms::ErrorRate::Sbv.generate_hashes(pep_to_prot_file, opts) do |prot_return_vals|
            known = prot_return_vals.compact
            if known.empty?
              DEFAULT_NO_PROTS_VAL
            else
              known.inject(0) {|sum, val| sum + val }.to_f / known.size
            end
          end
        end # end method
      end # module
    end # class
  end # ErrorRate
end # Ms
|
39
|
+
|
@@ -0,0 +1,111 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
require 'yaml' # for Hash#to_yaml in Sbv.generate_hashes

module Ms
  module ErrorRate
    # Sample Bias Validator
    class Sbv
      LENGTH_EXT = 'freq_by_length'
      AASEQ_EXT = 'by_aaseq'

      # Reads pep_to_prot_file (one "<peptide>: <prot>-<prot>-..." entry per
      # line), computes one value per peptide with the given block and writes
      # two files:
      #
      #   <base>.<type_code>.<aaseq_ext><file_ext>   "<peptide>: <value>" lines
      #   <base>.<type_code>.<length_ext><file_ext>  YAML hash mapping peptide
      #                                              length to the mean value
      #
      # where <base> is pep_to_prot_file without its extension.
      #
      # If opts[:protein_hash] is given, the block is yielded the array of
      # values found by keying the hash with each protein of the peptide
      # (nil for unknown proteins).  Otherwise the block is yielded the
      # peptide itself.  The block must return a number.
      #
      # opts (defaults):
      #   :aaseq_ext => AASEQ_EXT, :length_ext => LENGTH_EXT,
      #   :file_ext => '.yml', :type_code => '', :protein_hash => nil,
      #   :stderr_counter => true  (progress dots on $stderr)
      #
      # Blank lines are skipped.  Returns the names of the two files written.
      def self.generate_hashes(pep_to_prot_file, opts={})
        op = { :aaseq_ext => AASEQ_EXT,
          :length_ext => LENGTH_EXT,
          :file_ext => '.yml',
          :type_code => '',
          :protein_hash => nil,
          :stderr_counter => true,
        }.merge(opts)

        base = pep_to_prot_file.chomp(File.extname(pep_to_prot_file))
        freqs = Hash.new {|h,k| h[k] = 0.0 }
        counts = Hash.new {|h,k| h[k] = 0 }
        (fileout1, fileout2) = [:aaseq_ext, :length_ext].map do |type_ext|
          base + '.' + op[:type_code] + '.' + op[type_ext] + op[:file_ext]
        end
        protein_hash = op[:protein_hash]
        pep_count = 0
        if op[:stderr_counter]
          $stderr.print "[working, 100,000 peptides = '.'] "
          $stderr.flush
        end
        File.open(fileout1 , 'w') do |out|
          File.foreach(pep_to_prot_file) do |line|
            # chomp (not chomp!): chomp! returns nil when the final line has
            # no trailing newline
            (pep, prot_string) = line.chomp.split(': ')
            next if pep.nil? # blank line

            answ =
              if protein_hash
                yield( protein_hash.values_at(*(prot_string.split('-'))) )
              else
                yield(pep)
              end
            out.puts "#{pep}: #{answ}"
            freqs[pep.size] += answ
            counts[pep.size] += 1
            pep_count += 1
            if pep_count % 100000 == 0 && op[:stderr_counter]
              $stderr.print '.'
              $stderr.flush
            end
          end
        end
        $stderr.puts "DONE!" if op[:stderr_counter]

        # mean value per peptide length
        avg_freq_by_size = {}
        freqs.each do |size, total|
          avg_freq_by_size[size] = total / counts[size]
        end
        File.open(fileout2, 'w') {|out| out.print avg_freq_by_size.to_yaml }
        [fileout1, fileout2]
      end


      # a hash by aaseq giving a value between 0 and 1 telling how much of
      # an indicator the hit is
      attr_accessor :indicator_by_aaseq

      attr_accessor :frequency_indicator_opposite

      # hash-like object giving the indicator frequency for a peptide length
      attr_accessor :size_to_freq

      # boolean
      attr_accessor :indicators_signify_true_hit


      # indicator_by_aaseq_hash:: looked up with each aaseq in update_precision
      # size_to_freq:: looked up with each aaseq's size in update_precision
      # frequency_indicator_opposite:: number used in the update_precision formula
      # indicators_signify_true_hit:: if false, update_precision returns 1 - value
      def initialize(indicator_by_aaseq_hash, size_to_freq, frequency_indicator_opposite, indicators_signify_true_hit=true)
        @indicators_signify_true_hit = indicators_signify_true_hit
        @frequency_indicator_opposite = frequency_indicator_opposite
        @indicator_by_aaseq = indicator_by_aaseq_hash
        @size_to_freq = size_to_freq
        # running totals maintained by update_precision
        @tot_num_indicators = 0.0
        @tot_num = 0
        @frequency_of_indicators_sum = 0.0
      end

      # Adds aaseq to the running totals and returns the cumulative precision
      # (fraction of true positives among total hits) over every aaseq seen
      # so far.  The frequency of indicators is the probability that a generic
      # amino acid sequence will be an indicator (this may vary by sequence
      # length, hence the size_to_freq lookup).
      #
      # aaseq must be a key of indicator_by_aaseq and its size a key of
      # size_to_freq (a nil lookup raises TypeError).
      def update_precision(aaseq)
        @tot_num_indicators += indicator_by_aaseq[aaseq]
        @tot_num += 1
        @frequency_of_indicators_sum += @size_to_freq[aaseq.size]
        # FP Indicator
        value = @tot_num_indicators * (1.0 - @frequency_indicator_opposite) * @frequency_of_indicators_sum / (@tot_num**2)
        if @indicators_signify_true_hit
          value  # a true indicator type (gives precision)
        else # false indicator type
          1 - value  # 1 - fdr == precision
        end
      end

      # NOTE(review): returns @aaseq_to_fraction, which nothing in this class
      # assigns, so this currently returns nil -- looks unfinished.
      def calculate_background_frequency
        @aaseq_to_fraction
      end

    end

  end
end
|