ms-error_rate 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.autotest +14 -0
- data/.gitmodules +9 -0
- data/History +16 -0
- data/LICENSE +2 -0
- data/Rakefile +52 -0
- data/VERSION +1 -1
- data/lib/ms/error_rate/decoy.rb +27 -0
- data/lib/ms/error_rate/qvalue/mascot/percolator.rb +93 -0
- data/lib/ms/error_rate/qvalue/mascot.rb +68 -0
- data/lib/ms/error_rate/qvalue/pepxml.rb +52 -0
- data/lib/ms/error_rate/qvalue.rb +93 -0
- data/lib/ms/error_rate/sbv/peptide_based.rb +30 -0
- data/lib/ms/error_rate/sbv/protein_based.rb +39 -0
- data/lib/ms/error_rate/sbv.rb +111 -0
- data/lib/ms/error_rate.rb +9 -0
- data/lib/ms/ident.rb +125 -0
- data/lib/support/sort_by_attributes.rb +51 -0
- data/lib/transmembrane/phobius.rb +136 -0
- data/lib/transmembrane/toppred.rb +368 -0
- data/lib/transmembrane.rb +157 -0
- data/schema/peptide_hit_qvalues.pqh.tsv +5 -0
- data/script/expert_addition.rb +26 -0
- data/script/expert_list.rb +53 -0
- data/script/fasta_ipi_to_ipi_decoy.rb +23 -0
- data/script/minimal_protein_set.rb +366 -0
- data/script/unique_seq_stats.rb +72 -0
- metadata +66 -14
data/.autotest
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
Autotest.add_hook :initialize do |at|
|
4
|
+
at.clear_mappings
|
5
|
+
end
|
6
|
+
|
7
|
+
Autotest.add_hook :initialize do |at|
|
8
|
+
at.add_mapping(%r%^lib/(.*)\.rb$%) { |_, m|
|
9
|
+
#["spec/#{m[1]}_spec.rb"]
|
10
|
+
#["test/#{m[1]}_test.rb"]
|
11
|
+
## for both specs and tests:
|
12
|
+
["spec/#{m[1]}_spec.rb","test/#{m[1]}_test.rb"]
|
13
|
+
}
|
14
|
+
end
|
data/.gitmodules
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
[submodule "submodule/ms-testdata"]
|
2
|
+
path = submodule/ms-testdata
|
3
|
+
url = git://github.com/bahuvrihi/ms-testdata.git
|
4
|
+
[submodule "submodule/ms-in_silico"]
|
5
|
+
path = submodule/ms-in_silico
|
6
|
+
url = git://github.com/bahuvrihi/ms-in_silico.git
|
7
|
+
[submodule "submodule/tap-mechanize"]
|
8
|
+
path = submodule/tap-mechanize
|
9
|
+
url = git://github.com/bahuvrihi/tap-mechanize.git
|
data/History
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
== 0.0.6
|
2
|
+
|
3
|
+
* changed peptide centric db output to full YAML (i.e., the protein IDs are in an inline array)
|
4
|
+
|
5
|
+
== 0.0.3
|
6
|
+
|
7
|
+
* switching to ms-template-ish structure
|
8
|
+
|
9
|
+
== 0.0.2 / 2009-10-14
|
10
|
+
|
11
|
+
* basic validation with peptide and protein centric sample bias validation.
|
12
|
+
* peptide centric database created that includes methionine cleavage.
|
13
|
+
|
14
|
+
== 0.0.1 / 2009-08-25
|
15
|
+
|
16
|
+
* initial work - borrowing basic structure from ms-sequest and using original mspire lib/validators work.
|
data/LICENSE
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
Copyright shared among contributing institutions:
|
2
2
|
Copyright (c) 2006-2008 University of Texas at Austin (the initial project)
|
3
3
|
Copyright (c) 2009 Regents of the University of Colorado and Howard Hughes Medical Institute. (modularization of the project)
|
4
|
+
Copyright (c) 2011 Brigham Young University (additions)
|
5
|
+
Authored by John T. Prince
|
4
6
|
|
5
7
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
8
|
of this software and associated documentation files (the "Software"), to deal
|
data/Rakefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
|
2
|
+
require 'rubygems'
|
3
|
+
require 'rake'
|
4
|
+
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
8
|
+
gem.name = "ms-error_rate"
|
9
|
+
gem.homepage = "http://github.com/jtprince/ms-error_rate"
|
10
|
+
gem.license = "MIT"
|
11
|
+
gem.summary = %Q{An mspire library for calculating error rates in MS/MS identifications (FDRs).}
|
12
|
+
gem.description = %Q{aids for creating and calculating error rates using target-decoy searches and sample validation.}
|
13
|
+
gem.email = "jtprince@gmail.com"
|
14
|
+
gem.authors = ["John Prince"]
|
15
|
+
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
16
|
+
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
17
|
+
# gem.add_runtime_dependency 'jabber4r', '> 0.1'
|
18
|
+
# gem.add_development_dependency 'rspec', '> 1.2.3'
|
19
|
+
gem.rubyforge_project = 'mspire'
|
20
|
+
gem.add_runtime_dependency("ms-core", ">= 0.0.2")
|
21
|
+
gem.add_runtime_dependency("ms-ident", ">= 0.0.20")
|
22
|
+
gem.add_development_dependency "spec-more", ">= 0"
|
23
|
+
gem.add_development_dependency "jeweler", "~> 1.5.2"
|
24
|
+
gem.add_development_dependency "rcov", ">= 0"
|
25
|
+
end
|
26
|
+
Jeweler::RubygemsDotOrgTasks.new
|
27
|
+
|
28
|
+
require 'rake/testtask'
|
29
|
+
Rake::TestTask.new(:spec) do |spec|
|
30
|
+
spec.libs << 'lib' << 'spec'
|
31
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
32
|
+
spec.verbose = true
|
33
|
+
end
|
34
|
+
|
35
|
+
#require 'rcov/rcovtask'
|
36
|
+
#Rcov::RcovTask.new do |spec|
|
37
|
+
# spec.libs << 'spec'
|
38
|
+
# spec.pattern = 'spec/**/*_spec.rb'
|
39
|
+
# spec.verbose = true
|
40
|
+
#end
|
41
|
+
|
42
|
+
task :default => :spec
|
43
|
+
|
44
|
+
require 'rake/rdoctask'
|
45
|
+
Rake::RDocTask.new do |rdoc|
|
46
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
47
|
+
|
48
|
+
rdoc.rdoc_dir = 'rdoc'
|
49
|
+
rdoc.title = "ms-error_rate #{version}"
|
50
|
+
rdoc.rdoc_files.include('README*')
|
51
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
52
|
+
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.10
|
@@ -0,0 +1,27 @@
|
|
1
|
+
|
2
|
+
module Ms
  module ErrorRate
    # Helpers for target-decoy error estimation.
    module Decoy
      # Estimates the precision (fraction of target hits that are true
      # positives) from target and decoy counts.
      #
      # The number of false target hits is estimated as num_decoy * frit, so
      # the result is (num_target - num_decoy * frit) / num_target.
      #
      # num_target:: number of target hits (anything responding to #to_f)
      # num_decoy::  number of decoy hits (anything responding to #to_f)
      # frit::       fraction applied to the decoy count (default 1.0)
      #
      # Returns a Float.  With zero target hits the ratio is undefined, so
      # 0.0 is returned when decoys were seen and 1.0 when none were.  The
      # value is not clamped: it is negative whenever num_decoy * frit
      # exceeds num_target.
      def self.precision(num_target, num_decoy, frit=1.0)
        # calculated as floats in case fractional amounts are passed in
        targets = num_target.to_f
        decoys = num_decoy.to_f
        return(decoys > 0.0 ? 0.0 : 1.0) if targets == 0.0
        (targets - (decoys * frit)) / targets
      end
    end
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
|
2
|
+
require 'ms/mascot/dat'
|
3
|
+
require 'ms/error_rate/qvalue'
|
4
|
+
require 'ms/error_rate/qvalue/mascot'
|
5
|
+
|
6
|
+
module Ms
  module ErrorRate
    module Qvalue
      module Mascot
        # Readers that combine Percolator tab-delimited output with Mascot
        # .dat files.
        module Percolator

          module_function

          # Parses a Percolator tab-delimited text file.
          #
          # The first line is the header; each column name becomes a Struct
          # member (with '-' substituted for '_', e.g. 'q-value' => :q_value).
          # The expected columns are:
          #
          #   PSMId score q-value posterior_error_prob peptide proteinIds
          #
          # Three additional members are appended: :query_num and :rank (the
          # integers following the last ':' in each ';'-separated part of the
          # PSMId) and :sequence (the part of the peptide between the first
          # and second '.', i.e. without the flanking residues and dots).
          # score, q-value and posterior_error_prob are cast to Float; all
          # columns after the peptide are collected into an Array.
          #
          # Returns an Array of Structs (empty for an empty file).  Blank
          # lines are skipped.
          def tab_txt(file)
            hits = []
            File.open(file) do |io|
              header = io.gets
              return hits unless header
              atts = header.chomp.split("\t").map {|v| v.gsub('-', '_').to_sym }
              atts.push(:query_num, :rank, :sequence)
              # anonymous Struct: a named one ("Hit") would define the global
              # constant Struct::Hit and redefine it on every call
              struct_class = Struct.new(*atts)

              io.each do |line|
                line = line.chomp
                next if line.empty?
                (query_rank, score, qvalue, perrp, peptide, *prots ) = line.split("\t")
                (query, rank) = query_rank.split(';').map {|v| v.split(':').last.to_i }

                hits << struct_class.new(query_rank, score.to_f, qvalue.to_f, perrp.to_f, peptide, prots, query, rank, peptide.split('.')[1])
              end
            end
            hits
          end

          # Returns an Array of MascotPeptideHit Structs
          # (:filename, :query_title, :charge, :sequence, :mowse, :qvalue),
          # one per query title: the hit with the highest mowse score among
          # those that have a Percolator q-value.
          #
          # datp_files and tab_txt_files are parallel Arrays: the q-values
          # read from tab_txt_files[i] are matched to the hits of
          # datp_files[i] by [query_num, rank].  Raises ArgumentError when
          # the two Arrays differ in size.
          #
          # opts =
          #     :min_peptide_length => Integer  (drop best hits whose
          #                                      sequence is shorter)
          def qvalues(datp_files, tab_txt_files, opts={})
            unless datp_files.size == tab_txt_files.size
              raise ArgumentError, "need exactly one tab txt file per dat file (got #{datp_files.size} dat and #{tab_txt_files.size} tab txt)"
            end
            min_pep_len = opts[:min_peptide_length]

            # we only want the top hit per query title (which should ensure
            # that we get the top hit per scan)
            hits_by_query_title = Hash.new {|h,k| h[k] = [] }
            datp_files.zip(tab_txt_files) do |datp_file, tab_txt_file|
              qvalue_by_query_rank = {}
              Ms::ErrorRate::Qvalue::Mascot::Percolator.tab_txt( tab_txt_file ).each do |struct|
                qvalue_by_query_rank[[struct.query_num, struct.rank]] = struct.q_value
              end

              base_no_ext = File.basename(datp_file, '.*')
              Ms::Mascot::Dat.open(datp_file) do |dat|
                dat.each_peptide_hit(:by => :groups, :yield_nil => false, :with_query => true) do |hits,query|
                  hits.each do |hit|
                    # hits without a Percolator q-value are ignored
                    qval = qvalue_by_query_rank[[hit.query_num, hit.hit_num]]
                    next unless qval
                    hit_as_struct = Ms::ErrorRate::Qvalue::Mascot::MascotPeptideHit.new(base_no_ext, query.title, query.charge, hit.sequence, hit.score, qval)
                    hits_by_query_title[hit_as_struct.query_title] << hit_as_struct
                  end
                end
              end
            end

            final_hits = []
            hits_by_query_title.each_value do |hits|
              best_hit = hits.max_by(&:mowse)
              # FILTER HERE:
              # ONLY TAKE the BEST HIT IF it passes any filters
              next if min_pep_len && best_hit.sequence.size < min_pep_len
              final_hits << best_hit
            end
            final_hits
          end

        end
      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'ms/error_rate/qvalue'
|
2
|
+
require 'ms/mascot/dat'
|
3
|
+
|
4
|
+
module Ms
  module ErrorRate
    module Qvalue
      # Q-values for Mascot searches run against separate target and decoy
      # databases.
      module Mascot
        MEMBERS = [:filename, :query_title, :charge, :sequence, :mowse, :qvalue].freeze

        MascotPeptideHit = Struct.new(*MEMBERS) do
          # emits an array rather than a Struct object
          def to_yaml(*args)
            to_a.to_yaml(*args)
          end
        end

        module_function

        # Returns an Array of MascotPeptideHit Structs
        # (:filename, :query_title, :charge, :sequence, :mowse, :qvalue) for
        # the target hits, each with its qvalue filled in by
        # Ms::ErrorRate::Qvalue.target_decoy_qvalues (hits ranked by mowse).
        #
        # target_files and decoy_files are Arrays of files opened with
        # Ms::Mascot::Dat.  Within each list only the best-scoring hit per
        # query title is kept.
        #
        # opts =
        #     :min_peptide_length => Integer  (drop best hits whose sequence
        #                                      is shorter)
        # opts is also handed on to Ms::ErrorRate::Qvalue.target_decoy_qvalues.
        def qvalues(target_files, decoy_files, opts={})
          min_pep_len = opts[:min_peptide_length]

          # we only want the top hit per query title (which should ensure
          # that we get the top hit per scan)
          (target_hits, decoy_hits) = [target_files, decoy_files].map do |files|
            hits_by_query_title = Hash.new {|h,k| h[k] = [] }
            files.each do |file|
              base_no_ext = File.basename(file, '.*')
              Ms::Mascot::Dat.open(file) do |dat|
                dat.each_peptide_hit(:by => :top, :yield_nil => false, :with_query => true) do |hit,query|
                  # qvalue member is left nil until the q-values are computed
                  hit_as_struct = MascotPeptideHit.new(base_no_ext, query.title, query.charge, hit.sequence, hit.score)
                  hits_by_query_title[hit_as_struct.query_title] << hit_as_struct
                end
              end
            end

            final_hits = []
            hits_by_query_title.each_value do |hits|
              best_hit = hits.max_by(&:mowse)
              # FILTER HERE:
              # ONLY TAKE the BEST HIT IF it passes any filters
              next if min_pep_len && best_hit.sequence.size < min_pep_len
              final_hits << best_hit
            end
            final_hits
          end

          pairs = Ms::ErrorRate::Qvalue.target_decoy_qvalues(target_hits, decoy_hits, opts, &:mowse)
          pairs.map do |hit, qval|
            hit.qvalue = qval
            hit
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'ms/error_rate/qvalue'
|
2
|
+
|
3
|
+
module Ms ; end
module Ms::ErrorRate ; end
module Ms::ErrorRate::Qvalue ; end

# Q-values for a target/decoy pair of pepXML files.
module Ms::ErrorRate::Qvalue::Pepxml
  module_function

  # Reads every search_hit element of target_pepxml and decoy_pepxml and
  # turns each into a Struct holding :aaseq (the 'peptide' attribute),
  # :charge (the 'assumed_charge' attribute of the hit's grandparent
  # element, as an Integer) and one Float member per search_score child.
  # The score member names are taken from the 'name' attributes of the first
  # search_hit encountered.
  #
  # The caller provides a sort_by block in which the best hits sort last.
  # Only opt[:z_together] is forwarded.
  #
  # Returns the [hit, qvalue] pairs produced by
  # Ms::ErrorRate::Qvalue.target_decoy_qvalues.
  #
  # NOTE(review): relies on Nokogiri being loaded by the caller; this file
  # does not require it itself.
  def target_decoy_qvalues(target_pepxml, decoy_pepxml, opt={}, &sort_by)
    score_names = nil

    (target_rows, decoy_rows) = [target_pepxml, decoy_pepxml].map do |file|
      File.open(file) do |io|
        doc = Nokogiri::XML.parse(io, nil, nil, Nokogiri::XML::ParseOptions::DEFAULT_XML | Nokogiri::XML::ParseOptions::NOBLANKS)
        # we can work with namespaces, or just remove them ...
        doc.remove_namespaces!
        doc.root.xpath('//search_hit').map do |search_hit|
          peptide = search_hit['peptide']
          z = search_hit.parent.parent['assumed_charge'].to_i
          score_nodes = search_hit.children.select {|node| node.name == 'search_score' }
          # names are captured once, from the very first hit
          score_names ||= score_nodes.map {|node| node['name'].to_sym }
          [peptide, z] + score_nodes.map {|node| node['value'].to_f }
        end
      end
    end

    members = [:aaseq, :charge] + (score_names || [])
    peptide_hit_class = Struct.new(*members)
    t_hits = target_rows.map {|row| peptide_hit_class.new(*row) }
    d_hits = decoy_rows.map {|row| peptide_hit_class.new(*row) }

    # hit and qvalue pairs
    Ms::ErrorRate::Qvalue.target_decoy_qvalues(t_hits, d_hits, :z_together => opt[:z_together], &sort_by)
  end
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
|
2
|
+
require 'set'
|
3
|
+
require 'ms/error_rate/decoy'
|
4
|
+
|
5
|
+
|
6
|
+
class Array
  # Backport of group_by, defined only when Array does not already respond
  # to it.  Returns a Hash mapping each distinct block result to the Array of
  # elements (in original order) that produced it.
  #
  # The returned Hash is a plain Hash without a default block, so looking up
  # a key that was never produced returns nil and does not insert anything.
  unless [].respond_to?(:group_by)
    def group_by(&block)
      groups = {}
      each do |v|
        (groups[block.call(v)] ||= []) << v
      end
      groups
    end
  end
end
|
15
|
+
|
16
|
+
module Ms

  module ErrorRate
    # For generating and working with q-value calculations. The q-value is
    # the global false discovery rate when accepting that particular ID. We
    # do not necessarily distinguish here between *how* the FDR is generated
    # (i.e., Storey's pFDR "the occurrence of false positives" vs.
    # Benjamini-Hochberg's FDR "the rate of false positives" [except to
    # prefer Storey when possible] ). The main point is that we sort and
    # threshold based on a global FDR.
    module Qvalue
      module_function

      # Returns an Array of [target_hit, qvalue] pairs.
      #
      # The sort block should sort from worst to best; when no block is
      # given hits are sorted by hit.score.
      #
      # opts:
      #   :z_together  (default false).  When falsy, target and decoy hits
      #                of all charges are ranked in one pool.  When truthy,
      #                hits are grouped by hit.charge, each group is ranked
      #                on its own and the pairs are concatenated.
      #                NOTE(review): the option name and the earlier comment
      #                ("group all charges together") suggest the opposite
      #                meaning; the behavior is kept as implemented --
      #                confirm the intended semantics.
      #   all options are also passed through to mixed_target_decoy.
      #
      # NOTE(review): target membership is tested with a Set, i.e. by
      # eql?/hash.  For Struct hits that is value equality, so a decoy hit
      # whose members all equal those of a target hit is counted as a target.
      def target_decoy_qvalues(target_hits, decoy_hits, opts={}, &sorting)
        sorting ||= :score
        opts = {:z_together => false}.merge(opts)
        target_set = Set.new(target_hits)

        # locals are named so that the closure does not overwrite the
        # target_hits argument
        pairs_for = lambda do |hits|
          sorted_best_to_worst = hits.sort_by(&sorting).reverse
          (ranked_targets, qvalues) = Ms::ErrorRate::Qvalue.mixed_target_decoy(sorted_best_to_worst, target_set, opts)
          ranked_targets.zip(qvalues)
        end

        all_together = target_hits + decoy_hits
        if !opts[:z_together]
          pairs_for.call(all_together)
        else
          all_pairs = []
          all_together.group_by(&:charge).each do |_charge, hits|
            # concat rather than push(*pairs): no argument splat of a
            # potentially very large Array
            all_pairs.concat(pairs_for.call(hits))
          end
          all_pairs
        end
      end

      # Returns [target_hits, qvalues] (parallel arrays sorted from best hit
      # to worst hit).  Expects an array-like object of hits sorted from
      # best to worst hit with decoys interspersed and a target_setlike
      # object that responds to :include? for the hit object.  Assumes the
      # hit is a decoy if not found in the target set!
      #
      # Each target hit's q-value is 1.0 minus
      # Ms::ErrorRate::Decoy.precision(targets so far, decoys so far).
      #
      # opts:
      #   :monotonic  (default true).  When true, each q-value is replaced by
      #               the smallest q-value found at that hit or any worse
      #               hit, so q-values never decrease from best to worst.
      #               When false the raw values are returned.
      def mixed_target_decoy(best_to_worst, target_setlike, opts={})
        opts = {:monotonic => true}.merge(opts)
        num_target = 0
        num_decoy = 0
        target_hits = []
        qvalues = []
        best_to_worst.each do |hit|
          if target_setlike.include?(hit)
            num_target += 1
            target_hits << hit
            qvalues << (1.0 - Ms::ErrorRate::Decoy.precision(num_target, num_decoy))
          else
            num_decoy += 1
          end
        end
        if opts[:monotonic]
          running_min = qvalues.last
          qvalues = qvalues.reverse.map do |val| # from worst to best score
            running_min = val unless running_min < val
            running_min
          end.reverse
        end
        [target_hits, qvalues]
      end

    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'ms/error_rate/sbv'
|
2
|
+
|
3
|
+
module Ms
  module ErrorRate
    class Sbv
      # Sample bias validation driven by the peptide's own amino acid
      # sequence: each peptide is scored 1 or 0 depending on whether it
      # contains a given amino acid often enough.
      class PeptideBased

        # Delegates to Ms::ErrorRate::Sbv.generate_hashes with the type code
        # "aa_min<min_num>", scoring each peptide as:
        #
        #   min_num == 1  -> 1 if the peptide includes aa, otherwise 0
        #   otherwise     -> 1 if at least min_num characters of the peptide
        #                    equal aa, otherwise 0
        #
        # Returns whatever Ms::ErrorRate::Sbv.generate_hashes returns.
        def self.generate_hashes(pep_to_prot_file, aa="C", min_num=1 )
          Ms::ErrorRate::Sbv.generate_hashes(pep_to_prot_file, :type_code => "aa_min#{min_num}") do |pep|
            has_enough =
              if min_num == 1
                pep.include?(aa)
              else
                occurrences = 0
                pep.each_char {|residue| occurrences += 1 if residue == aa }
                occurrences >= min_num
              end
            has_enough ? 1 : 0
          end
        end

      end # class
    end # Sbv
  end # ER
end # Ms
|
30
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require 'ms/fasta'
|
2
|
+
require 'ms/error_rate/sbv'
|
3
|
+
require 'transmembrane'
|
4
|
+
|
5
|
+
module Ms
  module ErrorRate
    class Sbv
      # Sample bias validation driven by per-protein values: a peptide's
      # score is the mean of the values of the proteins it maps to.
      module ProteinBased
        # score given to a peptide none of whose proteins has a known value
        DEFAULT_NO_PROTS_VAL = 0.0

        # Delegates to Ms::ErrorRate::Sbv.generate_hashes, scoring each
        # peptide with the average of the non-nil values looked up for its
        # proteins (DEFAULT_NO_PROTS_VAL when there are none).
        #
        # note the pep to prot hash has proteins in a string separated by a
        # hyphen.
        #
        # pep_to_prot_file:: path handed to Sbv.generate_hashes
        # protid_to_val::    Hash of protein id => numeric value; passed on
        #                    as :protein_hash
        # options::          extra options for Sbv.generate_hashes;
        #                    :type_code defaults to 'tm'.  The caller's Hash
        #                    is not modified.
        #
        # returns the names of the files written
        def self.generate_hashes(pep_to_prot_file, protid_to_val, options={})
          # merge into a copy so the caller's options hash stays untouched
          opts = options.merge(:protein_hash => protid_to_val)
          opts[:type_code] ||= 'tm'

          Ms::ErrorRate::Sbv.generate_hashes(pep_to_prot_file, opts) do |prot_return_vals|
            known = prot_return_vals.reject {|val| val.nil? }
            if known.empty?
              DEFAULT_NO_PROTS_VAL
            else
              known.inject(0) {|sum, val| sum + val }.to_f / known.size
            end
          end
        end # end method
      end # module
    end # class
  end # ErrorRate
end # Ms
|
39
|
+
|
@@ -0,0 +1,111 @@
|
|
1
|
+
|
2
|
+
|
3
|
+
module Ms
  module ErrorRate
    # Sample Bias Validator
    class Sbv
      LENGTH_EXT = 'freq_by_length'
      AASEQ_EXT = 'by_aaseq'

      # Reads pep_to_prot_file (one "<peptide>: <protein ids joined by '-'>"
      # entry per line), obtains a numeric answer for every peptide from the
      # given block and writes two files:
      #
      # 1. "<base>.<type_code>.<aaseq_ext><file_ext>" with one
      #    "<peptide>: <answer>" line per peptide
      # 2. "<base>.<type_code>.<length_ext><file_ext>" with a YAML hash of
      #    peptide length => average answer for peptides of that length
      #
      # where <base> is pep_to_prot_file without its extension.
      #
      # If a protein hash is given (opts[:protein_hash]), the block is
      # yielded an array with the value found for each protein of the
      # peptide (nil for unknown proteins).  Otherwise the block is yielded
      # each peptide in turn.  Blank lines are skipped.
      #
      # opts (defaults):
      #   :aaseq_ext      => AASEQ_EXT
      #   :length_ext     => LENGTH_EXT
      #   :file_ext       => '.yml'
      #   :type_code      => ''
      #   :protein_hash   => nil
      #   :stderr_counter => true   (progress output on $stderr)
      #
      # Returns [aaseq_file_name, length_file_name].
      def self.generate_hashes(pep_to_prot_file, opts={})
        # loaded here because this file has no require section of its own;
        # needed for Hash#to_yaml below
        require 'yaml'

        op = { :aaseq_ext => AASEQ_EXT,
          :length_ext => LENGTH_EXT,
          :file_ext => '.yml',
          :type_code => '',
          :protein_hash => nil,
          :stderr_counter => true,
        }.merge(opts)

        base = pep_to_prot_file.chomp(File.extname(pep_to_prot_file))
        freqs = Hash.new {|h,k| h[k] = 0.0 }
        counts = Hash.new {|h,k| h[k] = 0 }
        (fileout1, fileout2) = [:aaseq_ext, :length_ext].map do |type_ext|
          base + '.' + op[:type_code] + '.' + op[type_ext] + op[:file_ext]
        end
        protein_hash = op[:protein_hash]
        pep_count = 0
        if op[:stderr_counter]
          $stderr.print "[working, 100,000 peptides = '.'] "
          $stderr.flush
        end
        File.open(fileout1 , 'w') do |out|
          IO.foreach(pep_to_prot_file) do |line|
            # String#chomp (not chomp!, which returns nil when the final
            # line has no trailing newline)
            (pep, prot_string) = line.chomp.split(': ')
            next unless pep

            answ =
              if protein_hash
                yield( protein_hash.values_at(*(prot_string.to_s.split('-'))) )
              else
                yield(pep)
              end
            out.puts "#{pep}: #{answ}"
            freqs[pep.size] += answ
            counts[pep.size] += 1
            pep_count += 1
            if pep_count % 100000 == 0 && op[:stderr_counter]
              $stderr.print '.'
              $stderr.flush
            end
          end
        end
        $stderr.puts "DONE!" if op[:stderr_counter]
        avg_freq_ar = {}
        freqs.each do |k,v|
          avg_freq_ar[k] = v / counts[k]
        end
        File.open(fileout2, 'w') {|out| out.print avg_freq_ar.to_yaml }
        [fileout1, fileout2]
      end


      # a hash by aaseq giving a value between 0 and 1 telling how much of
      # an indicator the hit is
      attr_accessor :indicator_by_aaseq

      attr_accessor :frequency_indicator_opposite

      # indexed by peptide length (aaseq.size) in update_precision
      attr_accessor :size_to_freq

      # boolean
      attr_accessor :indicators_signify_true_hit


      # indicator_by_aaseq_hash::      stored as indicator_by_aaseq
      # size_to_freq::                 stored as size_to_freq
      # frequency_indicator_opposite:: stored as frequency_indicator_opposite
      # indicators_signify_true_hit::  boolean (default true)
      #
      # The running totals used by update_precision start at zero.
      def initialize(indicator_by_aaseq_hash, size_to_freq, frequency_indicator_opposite, indicators_signify_true_hit=true)
        @indicators_signify_true_hit = indicators_signify_true_hit
        @frequency_indicator_opposite = frequency_indicator_opposite
        @indicator_by_aaseq = indicator_by_aaseq_hash
        @size_to_freq = size_to_freq
        @tot_num_indicators = 0.0
        @tot_num = 0
        @frequency_of_indicators_sum = 0.0
      end

      # Adds the hit with the given aaseq to the running totals and returns
      # the cumulative precision (fraction of true positives among total
      # hits) computed from them:
      #
      #   value = tot_num_indicators * (1.0 - frequency_indicator_opposite) *
      #           frequency_of_indicators_sum / tot_num**2
      #
      # value is returned as is when indicators_signify_true_hit is true,
      # otherwise 1 - value is returned.
      #
      # NOTE(review): aaseq presumably has to be a key of indicator_by_aaseq
      # and its size a key of size_to_freq (a nil lookup raises) -- confirm
      # against callers.
      def update_precision(aaseq)
        @tot_num_indicators += indicator_by_aaseq[aaseq]
        @tot_num += 1
        @frequency_of_indicators_sum += @size_to_freq[aaseq.size]
        # FP Indicator
        value = @tot_num_indicators * (1.0 - @frequency_indicator_opposite) * @frequency_of_indicators_sum / (@tot_num**2)
        if @indicators_signify_true_hit
          value # a true indicator type (gives precision)
        else # false indicator type
          1 - value # 1 - fdr == precision
        end
      end

      # Returns @aaseq_to_fraction, which nothing in this class assigns.
      def calculate_background_frequency
        @aaseq_to_fraction
      end

    end

  end
end
|