mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
data/lib/ms/parser.rb
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
require 'xml_style_parser'
|
|
2
|
+
|
|
3
|
+
module MS; end
|
|
4
|
+
|
|
5
|
+
module MS::Parser
|
|
6
|
+
# inherits attr_accessor :method, :default_parser, and parse (which should
|
|
7
|
+
# be overridden)
|
|
8
|
+
include XMLStyleParser
|
|
9
|
+
|
|
10
|
+
# Matches the sashimi schema URL found in an mzXML header.
# Capture 1 is the optional "_revision" path suffix; capture 2 is the schema
# name that follows, e.g.:
#   'http://sashimi.sourceforge.net/schema/MsXML.xsd'           # version 1
#   'http://sashimi.sourceforge.net/schema_revision/mzXML_X.X'  # others
# The dots of the host name are escaped so that they only match a literal '.'.
Mzxml_regexp = %r{http://sashimi\.sourceforge\.net/schema(_revision)?/([\w.]+)}

# Matches the opening <mzData ...> tag; capture 1 is the version attribute.
# The scan up to version= is non-greedy so that the FIRST version attribute
# after <mzData is captured, not a later one appearing in the same string.
Mzdata_regexp = /<mzData.*?version="([\d.]+)"/m
|
|
14
|
+
|
|
15
|
+
attr_accessor :version
|
|
16
|
+
|
|
17
|
+
############################################
|
|
18
|
+
# POINTERS (to create META MAGIC)
|
|
19
|
+
############################################
|
|
20
|
+
|
|
21
|
+
# filetype symbol => name of the parser constant under MS::Parser
@@filetypes_to_upcase = {
  :mzxml => 'MzXML',
  :mzdata => 'MzData',
  :mzml => 'MzML',
  :raw => 'Raw',
}

# filetype symbol => path handed to `require` (e.g. 'ms/parser/mzxml')
@@filetypes_to_require = {}
# filetype symbol => fully qualified constant name, as a String
@@filetypes_to_constant = {}

# One abbreviation per *.rb file sitting directly inside the "parser"
# directory next to this file (hidden files are ignored, as a "*.rb" glob
# would). The directory is listed by explicit path rather than through a
# Dir.chdir block, so loading this file never touches the process-wide
# working directory.
parser_dir = File.join(File.dirname(__FILE__), "parser")
abbrevs = Dir.entries(parser_dir).grep(/\A[^.].*\.rb\z/).sort.map {|f| f.sub(/\.rb\z/,'') }
abbrevs.each do |abbr|
  abb = abbr.to_sym
  req = ['ms', 'parser', abbr].join("/")
  @@filetypes_to_require[abb] = req
  @@filetypes_to_constant[abb] = ['MS', 'Parser', @@filetypes_to_upcase[abb]].join("::")
end
|
|
40
|
+
|
|
41
|
+
############################################
|
|
42
|
+
# END POINTERS
|
|
43
|
+
############################################
|
|
44
|
+
|
|
45
|
+
# finds the filetype of a file (expects to be at the beginning) and rewinds
|
|
46
|
+
# the filehandle to the beginning returns [filetype, version]. nil if
|
|
47
|
+
# filetype and version could not be determined
|
|
48
|
+
# Sniffs the type and version of a mass spec data file.
#
# fh_or_filename:: an IO positioned at the beginning of the data, or a
#                  filename (anything File.open accepts), which is opened,
#                  inspected and closed again.
#
# Returns [filetype, version] where filetype is :raw, :mzxml or :mzdata
# (version is nil for :raw), or nil if nothing could be determined. A passed
# in IO is always rewound to the beginning before returning.
#
# Aborts the process if a sashimi schema URL is found whose mzXML version
# cannot be worked out.
def self.filetype_and_version(fh_or_filename)
  unless fh_or_filename.is_a? IO
    return File.open(fh_or_filename) {|fh| filetype_and_version(fh) }
  end
  fh = fh_or_filename

  # Test for RAW file: every second byte starting at offset 2 spells
  # 'Finnigan'. The template consumes 17 bytes, so empty or shorter files
  # (for which read returns nil or a short string) cannot be RAW and must
  # not be handed to unpack.
  header = fh.read(18)
  fh.rewind
  if header && header.size >= 17 && header.unpack('@2axaxaxaxaxaxaxa').join == 'Finnigan'
    return [:raw, nil]
  end

  found = nil
  while (line = fh.gets)
    found =
      case line
      when Mzxml_regexp
        mtch = $2.dup
        case mtch
        when /mzXML_([\d\.]+)/
          [:mzxml, $1.dup]
        when /MsXML/
          [:mzxml, '1.0']
        else
          abort "Cannot determine mzXML version!"
        end
      when Mzdata_regexp
        [:mzdata, $1.dup]
      end
    break if found
  end
  fh.rewind
  found
end
|
|
86
|
+
|
|
87
|
+
# filetype_version is an example file to parse, or it is an array: [type, version].
|
|
88
|
+
# parse_type is the information to be gleaned (as symbol).
|
|
89
|
+
# Builds a parser for one kind of file.
#
# filetype_version:: either a [type, version] Array, or an example file
#                    (handed to filetype_and_version to work the pair out)
# parse_type::       the information to be gleaned (as symbol)
def self.new(filetype_version, parse_type)
  type_and_version =
    if filetype_version.is_a?(Array)
      filetype_version
    else
      filetype_and_version(filetype_version)
    end
  require_and_create_parser(type_and_version, parse_type)
end
|
|
95
|
+
|
|
96
|
+
private
|
|
97
|
+
|
|
98
|
+
# returns a working parser.
|
|
99
|
+
# Returns a working parser.
#
# filetype_version:: [filetype, version] pair; filetype is the key used to
#                    look up both the file to require and the parser class
# parse_type::       passed through to the parser class constructor
#
# The file registered for the filetype is required, the matching constant is
# looked up under MS::Parser, and it is instantiated with
# (parse_type, version).
def self.require_and_create_parser(filetype_version, parse_type)
  (filetype, version) = filetype_version
  #puts "FT: #{filetype} VERSION: #{version}"
  require @@filetypes_to_require[filetype]
  parser_class = MS::Parser.const_get(@@filetypes_to_upcase[filetype])
  parser_class.new(parse_type, version)
end
|
|
107
|
+
|
|
108
|
+
end
|
data/lib/ms/precursor.rb
ADDED
data/lib/ms/scan.rb
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
require 'array_class'
|
|
2
|
+
require 'ms/precursor'
|
|
3
|
+
|
|
4
|
+
module MS ; end
|
|
5
|
+
|
|
6
|
+
# 0 1 2 3 4 5 6
|
|
7
|
+
MS::Scan = ArrayClass.new( %w(num ms_level time start_mz end_mz precursors spectrum) )
|
|
8
|
+
|
|
9
|
+
# time in seconds
|
|
10
|
+
# everything else in float/int or as array (precursors)
|
|
11
|
+
|
|
12
|
+
class MS::Scan
|
|
13
|
+
#@@order = %w(num ms_level time start_mz end_mz prec_mz prec_inten parent spectrum)
|
|
14
|
+
#attr_accessor :num, :ms_level, :time, :start_mz, :end_mz, :prec_mz, :prec_inten, :parent, :spectrum
|
|
15
|
+
|
|
16
|
+
#def initialize(ar=nil)
|
|
17
|
+
# @@order.zip(ar) do |x,v|
|
|
18
|
+
# send((x+'=').to_sym, v)
|
|
19
|
+
# end
|
|
20
|
+
#end
|
|
21
|
+
|
|
22
|
+
# One-line summary of the scan: its number, ms level and time.
def to_s
  summary = "<Scan num=#{num} ms_level=#{ms_level} time=#{time}>"
  summary
end
|
|
25
|
+
|
|
26
|
+
undef_method :inspect
|
|
27
|
+
# Compact inspect string: the scalar attributes that are set, followed by
# the precursors and the number of m/z values in the spectrum (blank if
# there is no spectrum). Every field is separated by ", ".
def inspect
  display = %w(num ms_level time start_mz end_mz).map do |att|
    val = send(att.to_sym)
    # attributes that are not set are left out altogether
    "@#{att}=#{val}" if val
  end
  display.compact!
  spec_display = spectrum ? spectrum.mz.size : nil
  # appended to the same list so they get the same ", " separator as the
  # scalar attributes instead of being glued onto the previous field
  display << "@precursors=#{precursors.inspect}"
  display << "@spectrum=size:#{spec_display}"
  "<MS::Scan:#{__id__} " + display.join(", ") + ">"
end
|
|
45
|
+
|
|
46
|
+
# returns the string (space delimited): "ms_level num time [prec_mz prec_inten]"
|
|
47
|
+
# Returns the string (space delimited): "ms_level num time [prec_mz prec_inten]"
#
# The precursor columns come from the first precursor only. They are left
# out when there are no precursors (nil or empty), and the intensity column
# is left out when the first precursor has no intensity.
def to_index_file_string
  arr = [ms_level, num, time]
  first_precursor = precursors && precursors.first
  if first_precursor
    arr << first_precursor.mz
    if x = first_precursor.inten then arr << x end
  end
  arr.join(" ")
end
|
|
53
|
+
|
|
54
|
+
# adds the attribute parent to each scan with a parent
|
|
55
|
+
# (level 1 = no parent; level 2 = prev level 1, etc.)
|
|
56
|
+
# Walks the scans in order and assigns scan.parent on every non-nil entry.
# A stack of candidate parents is kept: going to a deeper ms level pushes the
# previous scan, going back up pops one entry per level climbed. The top of
# the stack (nil at the outermost level) becomes the parent.
# NOTE(review): relies on scans responding to parent=, which is not among
# the attributes listed where MS::Scan is created in this file -- confirm.
def self.add_parent_scan(scans)
  previous = nil
  ancestors = [nil]
  ## we want to set the level to be the first mslevel we come to
  first_scan = scans.find {|candidate| candidate }
  last_level = first_scan ? first_scan.ms_level : 1
  scans.each do |scan|
    next unless scan ## the first one is nil, (others?)
    level = scan.ms_level
    if last_level < level
      ancestors.unshift(previous)
    elsif last_level > level
      (last_level - level).times { ancestors.shift }
    end
    scan.parent = ancestors.first
    last_level = level
    previous = scan
  end
end
|
|
78
|
+
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
|
data/lib/ms/spectrum.rb
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
require 'base64'
|
|
2
|
+
require 'bsearch'
|
|
3
|
+
require 'ms'
|
|
4
|
+
|
|
5
|
+
class MS::Spectrum
|
|
6
|
+
|
|
7
|
+
# String#unpack templates used when decoding base64 peak data.
# 'g*': single-precision floats, big-endian (network) byte order
Unpack_network_float = 'g*'
|
|
8
|
+
# 'G*': double-precision floats, big-endian (network) byte order
Unpack_network_double = 'G*'
|
|
9
|
+
# 'e*': single-precision floats, little-endian byte order
Unpack_little_endian_float = 'e*'
|
|
10
|
+
# 'E*': double-precision floats, little-endian byte order
Unpack_little_endian_double = 'E*'
|
|
11
|
+
|
|
12
|
+
# m/z's
|
|
13
|
+
attr_accessor :mz
|
|
14
|
+
# intensities
|
|
15
|
+
attr_accessor :intensity
|
|
16
|
+
|
|
17
|
+
# mz::        the m/z values (defaults to an empty array)
# intensity:: the intensities (defaults to an empty array)
def initialize(mz=[], intensity=[])
  @mz, @intensity = mz, intensity
end
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# true if there is at least one m/z value and the first one is Numeric
def has_mz_data?
  return false unless @mz.size > 0
  @mz.first.is_a?(Numeric)
end
|
|
26
|
+
|
|
27
|
+
# true if there is at least one intensity and the first one is Numeric
def has_intensity_data?
  return false unless @intensity.size > 0
  @intensity.first.is_a?(Numeric)
end
|
|
30
|
+
|
|
31
|
+
# takes a base64 string and returns an array
|
|
32
|
+
# Takes a base64 string and returns an array of floats.
#
# string::        base64 encoded, packed floating point data
# precision::     bits per value, 32 (single) or 64 (double)
# network_order:: true for big-endian (network) byte order, false for
#                 little-endian
#
# Raises ArgumentError for any other precision (previously this surfaced as
# a TypeError from unpack because no template had been selected).
def self.base64_to_array(string, precision=32, network_order=true)
  unpack_code =
    case precision
    when 32
      network_order ? Unpack_network_float : Unpack_little_endian_float
    when 64
      network_order ? Unpack_network_double : Unpack_little_endian_double
    else
      raise ArgumentError, "precision must be 32 or 64 (got #{precision.inspect})"
    end
  Base64.decode64(string).unpack(unpack_code)
end
|
|
50
|
+
|
|
51
|
+
# Builds a spectrum from two separately encoded base64 strings, one holding
# the m/z values and one the intensities. Each string comes with its own
# precision and byte order, passed straight on to base64_to_array.
def self.from_base64_pair(mz_string, mz_precision, mz_network_order, inten_string, inten_precision, inten_network_order)
  new(
    base64_to_array(mz_string, mz_precision, mz_network_order),
    base64_to_array(inten_string, inten_precision, inten_network_order)
  )
end
|
|
56
|
+
|
|
57
|
+
# takes a base64 peaks string and sets spectrum
|
|
58
|
+
# returns self for chaining
|
|
59
|
+
# Takes a base64 peaks string (m/z and intensity values interleaved:
# mz0, inten0, mz1, inten1, ...) and returns a new spectrum built from it.
#
# string::        base64 encoded peak data
# precision::     bits per value (see base64_to_array)
# network_order:: byte order flag (see base64_to_array)
def self.from_base64_peaks(string, precision=32, network_order=true)
  data = base64_to_array(string, precision, network_order)
  mz = []
  intensity = []
  data.each_slice(2) do |pair|
    mz << pair[0]
    # a dangling last value (odd number of values) has no intensity partner
    intensity << pair[1] if pair.size == 2
  end
  self.new(mz, intensity)
end
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
######
|
|
81
|
+
# NOT REALLY USING RIGHT NOW:
|
|
82
|
+
######
|
|
83
|
+
|
|
84
|
+
# takes a base64 peaks string and returns an array of [m/z,intens] doublets
|
|
85
|
+
# mzXML as network ordered
|
|
86
|
+
# Takes a base64 peaks string and returns an array of [m/z, intensity]
# doublets. The string is decoded with the class level base64_to_array using
# its default (network) byte order. If the data holds an odd number of
# values the last doublet is [m/z, nil].
def base64_peaks_to_pairs(string, precision=32)
  # the decoder lives on the class, not on the instance
  data = self.class.base64_to_array(string, precision)
  data.each_slice(2).map {|mz, inten| [mz, inten] }
end
|
|
100
|
+
|
|
101
|
+
# returns the index of the first value matching that m/z. the argument m/z
|
|
102
|
+
# may be less precise than the actual m/z (rounding to the same precision
|
|
103
|
+
# given) but must be at least integer precision (after rounding)
|
|
104
|
+
# implemented as binary search (bsearch from the web)
|
|
105
|
+
# Returns the index of the first value found to match the given m/z, or nil.
#
# The lower boundary position for mz is located with bsearch_lower_boundary.
# If the value there is not exactly equal, mz is converted to a float and the
# value at that position is compared after rounding to the precision of mz
# (equal_after_rounding?). Failing that, the neighbours are tried the same
# way: first walking upwards, then downwards, each walk stopping at the end
# of the array or once the integer parts are 2 or more apart.
def index(mz)
  ind = @mz.bsearch_lower_boundary {|x| x <=> mz }
  return ind if @mz[ind] == mz

  # do a rounding game to see which one is it, or nil
  mz = mz.to_f
  mz_size = @mz.size
  return ind if ind < mz_size && equal_after_rounding?(@mz[ind], mz)

  # search above the boundary
  (ind + 1).upto(mz_size - 1) do |up|
    mz_up = @mz[up]
    break if mz_up.ceil - mz.ceil >= 2
    return up if equal_after_rounding?(mz_up, mz)
  end

  # search below the boundary
  (ind - 1).downto(0) do |dn|
    mz_dn = @mz[dn]
    break if mz.floor - mz_dn.floor >= 2
    return dn if equal_after_rounding?(mz_dn, mz)
  end

  nil
end
|
|
155
|
+
|
|
156
|
+
# uses index function and returns the intensity at that value
|
|
157
|
+
# Looks the m/z up with index and returns the intensity stored at that
# position, or nil if index finds nothing.
def intensity_at_mz(mz)
  position = index(mz)
  position ? @intensity[position] : nil
end
|
|
164
|
+
|
|
165
|
+
# less_precise should be a float
|
|
166
|
+
# precise should be a float
|
|
167
|
+
# true if precise, rounded to the number of decimal places carried by
# less_precise, equals less_precise.
#
# precise::      a float
# less_precise:: a float given to fewer (or as many) decimal places
def equal_after_rounding?(precise, less_precise)
  # determine the precision of less_precise
  exp10 = precision_as_neg_int(less_precise)
  (precise*exp10).round == (less_precise*exp10).round
end

# returns 1 for ones place, 10 for tenths, 100 for hundredths
# to a precision exceeding 1e-6
#
# i.e. the smallest power of ten by which float can be multiplied so that
# the product lies within 1e-6 of a whole number
def precision_as_neg_int(float)
  neg_exp10 = 1
  loop do
    over = float * neg_exp10
    break if (over - over.round).abs <= 1e-6
    neg_exp10 *= 10
  end
  neg_exp10
end
|
|
191
|
+
|
|
192
|
+
end
|
|
193
|
+
|
data/lib/ms.rb
ADDED
data/lib/mspire.rb
ADDED
data/lib/roc.rb
CHANGED
|
@@ -42,10 +42,19 @@ class ROC
|
|
|
42
42
|
area
|
|
43
43
|
end
|
|
44
44
|
|
|
45
|
+
# takes two lists of values and makes doublets [[val, boolean],...]
|
|
46
|
+
# Merges a list of true-positive values and a list of false-positive values
# into one sorted list of [value, boolean] doublets. Values are tagged 0
# (true) or 1 (false) before sorting, so that among equal values the true
# ones sort first; the tag is turned into a boolean afterwards.
def separate_to_doublets(tps, fps)
  tagged = tps.map {|val| [val, 0] } + fps.map {|val| [val, 1] }
  tagged.sort.map {|val, tag| [val, tag == 0] }
end
|
|
53
|
+
|
|
45
54
|
# given an array of doublets where each doublet is a value and a boolean,
|
|
46
55
|
# sorts the list and divides it into two arrays (tps, fps) of the values.
|
|
47
56
|
# The output can then be fed into many of the other routines.
|
|
48
|
-
def
|
|
57
|
+
def doublets_to_separate(list)
|
|
49
58
|
tp = []; fp = []
|
|
50
59
|
list.each do |dbl|
|
|
51
60
|
if dbl[1]
|
|
@@ -85,6 +94,27 @@ class ROC
|
|
|
85
94
|
end
|
|
86
95
|
return x, y
|
|
87
96
|
end
|
|
97
|
+
|
|
98
|
+
# takes previously sorted doublets [value, boolean]
|
|
99
|
+
# Takes doublets [value, boolean] in the order they are to be accumulated
# and returns [num_hits, ppv] as two parallel arrays. One point is emitted
# after the last doublet of every run of equal values: the number of
# doublets seen so far, and the fraction of those whose boolean is true.
def numhits_and_ppv(doublets)
  hit_counts = []
  ppvs = []
  true_hits = 0
  final_index = doublets.size - 1
  doublets.each_with_index do |doublet, i|
    true_hits += 1 if doublet[1]
    # ties are reported once, at the end of the run
    next unless i == final_index || doublet[0] != doublets[i+1][0]
    seen = i + 1
    hit_counts << seen
    ppvs << true_hits.to_f/seen
  end
  [hit_counts, ppvs]
end
|
|
116
|
+
|
|
117
|
+
|
|
88
118
|
end
|
|
89
119
|
|
|
90
120
|
# For calculating precision given lists of hits and decoy hits. The hits are
|
|
@@ -124,4 +154,34 @@ class DecoyROC < ROC
|
|
|
124
154
|
[num_hits_ar, num_tps_ar, ppv_ar]
|
|
125
155
|
end
|
|
126
156
|
|
|
157
|
+
# returns [num_hits, precision] as a function of num hits. decoy hits are
|
|
158
|
+
# seen merely as indicators of the number of false hits in the dataset.
|
|
159
|
+
# This is the same algorithm as pred_and_tps_and_ppv, just eliminates
|
|
160
|
+
# unneeded calcs
|
|
161
|
+
# Walks hits in order. For each hit the decoy cursor is advanced past every
# decoy hit that is <= the current hit; the cursor position is taken as the
# number of false hits so far. One point is recorded at the last hit of each
# run of equal values: the number of hits so far, and
# (hits so far - decoys passed) / hits so far.
# Returns [num_hits_ar, ppv_ar].
def pred_and_ppv(hits, decoy_hits)
  decoy_i = 0
  num_hits_ar = []
  ppv_ar = []

  hits.each_with_index do |hit, hits_i|
    decoy_i += 1 while decoy_i < decoy_hits.size && hit >= decoy_hits[decoy_i]
    # ties are reported once, at the end of the run
    next if hit == hits[hits_i+1]

    tot_num_hits = hits_i+1
    num_tps = tot_num_hits - decoy_i
    num_hits_ar << tot_num_hits
    ppv_ar << ( num_tps.to_f/tot_num_hits )
  end
  [num_hits_ar, ppv_ar]
end
|
|
186
|
+
|
|
127
187
|
end
|
data/lib/sample_enzyme.rb
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
|
|
2
2
|
module SpecIDXML; end
|
|
3
3
|
|
|
4
|
-
require 'spec_id_xml'
|
|
5
4
|
require 'strscan'
|
|
6
5
|
|
|
6
|
+
require 'spec_id_xml'
|
|
7
|
+
require 'spec_id'
|
|
8
|
+
|
|
9
|
+
|
|
7
10
|
class SampleEnzyme
|
|
8
11
|
include SpecIDXML
|
|
9
12
|
|
|
@@ -18,6 +21,7 @@ class SampleEnzyme
|
|
|
18
21
|
# Currently, recognize:
|
|
19
22
|
# trypsin
|
|
20
23
|
# For other enzymes, you must set :cut, :no_cut, :name, and :sense
|
|
24
|
+
# will yield the object if you want to set the values that way
|
|
21
25
|
def initialize(name=nil)
|
|
22
26
|
@sense = nil
|
|
23
27
|
@cut = nil
|
|
@@ -25,11 +29,14 @@ class SampleEnzyme
|
|
|
25
29
|
@name = name
|
|
26
30
|
if @name
|
|
27
31
|
# set the values if we recognize this name
|
|
28
|
-
send(@name.to_sym)
|
|
32
|
+
send("set_#{@name}".to_sym)
|
|
33
|
+
end
|
|
34
|
+
if block_given?
|
|
35
|
+
yield(self)
|
|
29
36
|
end
|
|
30
37
|
end
|
|
31
38
|
|
|
32
|
-
def
|
|
39
|
+
def set_trypsin
|
|
33
40
|
@sense = 'C'
|
|
34
41
|
@cut = 'KR'
|
|
35
42
|
@no_cut = 'P'
|
|
@@ -41,10 +48,26 @@ class SampleEnzyme
|
|
|
41
48
|
end
|
|
42
49
|
end
|
|
43
50
|
|
|
51
|
+
# returns self
|
|
52
|
+
# Fills in this enzyme from a pepxml node: name is read from the node itself,
# cut, no_cut and sense from the object returned by node.child.
# NOTE(review): presumably node.child is the element carrying the cut
# specificity attributes -- confirm against the XML library in use.
# returns self
def from_pepxml_node(node)
  self.name = node['name']
  specificity = node.child
  %w(cut no_cut sense).each do |att|
    send("#{att}=", specificity[att])
  end
  self
end

# Creates a new, empty enzyme and fills it in from the pepxml node.
def self.from_pepxml_node(node)
  enzyme = new
  enzyme.from_pepxml_node(node)
end
|
|
64
|
+
|
|
44
65
|
# returns all peptides of missed cleavages <= 'missed_cleavages'
|
|
45
66
|
# so 2 missed cleavages will return all no missed cleavage peptides
|
|
46
67
|
# all 1 missed cleavages and all 2 missed cleavages.
|
|
47
|
-
|
|
68
|
+
# options:
|
|
69
|
+
def digest(string, missed_cleavages=0, options={})
|
|
70
|
+
raise NotImplementedError if @sense == 'N'
|
|
48
71
|
s = StringScanner.new(string)
|
|
49
72
|
no_cut_regex = Regexp.new("[#{@no_cut}]")
|
|
50
73
|
regex = Regexp.new("[#{@cut}]")
|
|
@@ -75,7 +98,7 @@ class SampleEnzyme
|
|
|
75
98
|
end
|
|
76
99
|
## LOOP through and grab each set of missed cleavages from num down to 0
|
|
77
100
|
all_sets_of_peps = []
|
|
78
|
-
(0..missed_cleavages).to_a.reverse.
|
|
101
|
+
(0..missed_cleavages).to_a.reverse.each do |num_mc|
|
|
79
102
|
all_sets_of_peps.push( *(get_missed_cleavages(peps, num_mc)) )
|
|
80
103
|
end
|
|
81
104
|
all_sets_of_peps
|
|
@@ -85,9 +108,9 @@ class SampleEnzyme
|
|
|
85
108
|
# cleavages
|
|
86
109
|
# DOES NOT contain peptides that contain < num of missed cleavages
|
|
87
110
|
# (i.e., will not return missed cleavages of 1 or 2 if num == 3)
|
|
88
|
-
def get_missed_cleavages(
|
|
89
|
-
(0...(
|
|
90
|
-
|
|
111
|
+
# Joins every run of num+1 consecutive peptide sequences into one string,
# i.e. returns the peptides spanning exactly num missed cleavages.
# Returns an empty array if there are not enough sequences.
def get_missed_cleavages(ar_of_peptide_seqs, num)
  window = num + 1
  last_start = ar_of_peptide_seqs.size - num
  (0...last_start).map {|start| ar_of_peptide_seqs[start, window].join }
end
|
|
93
116
|
|
data/lib/scan_i.rb
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
# http://groups.google.com/group/comp.lang.ruby/browse_thread/thread/7370f94e852c0fae/4068c8c1c1c158ee
|
|
3
|
+
class String
  # Returns the start indices of every non-overlapping occurrence of seq,
  # scanning left to right.
  #
  #   "abcabc".scan_i("bc")  # => [1, 4]
  #   "aaaa".scan_i("aa")    # => [0, 2]
  #
  # An empty seq matches at every position, including the end of the string.
  def scan_i seq
    pos=0
    ndx=[]
    # Always advance by at least one character: with an empty seq the search
    # position would otherwise never move and the loop would never end.
    step = [seq.length, 1].max
    while i=index(seq,pos)
      ndx << i
      pos = i + step
    end
    ndx
  end

  #def scan_enum seq
  #  self.enum_for(:scan, seq).map do
  #    $~.offset(0)[0]
  #  end
  #end
end
|
|
21
|
+
|
data/lib/spec_id/aa_freqs.rb
CHANGED
|
@@ -9,9 +9,10 @@ class SpecID::AAFreqs
|
|
|
9
9
|
# seeing that amino acid. Frequencies should add to 1.
|
|
10
10
|
attr_accessor :aafreqs
|
|
11
11
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
# fasta is fasta object!
|
|
13
|
+
# fasta is fasta object!
# When one is given, the amino acid frequencies are calculated from it
# straight away and stored in @aafreqs.
def initialize(fasta=nil)
  @fasta = fasta
  @aafreqs = calculate_frequencies(@fasta) if @fasta
end
|
|
@@ -64,6 +65,9 @@ class SpecID::AAFreqs
|
|
|
64
65
|
# returns two numbers in array [actual, expected]
|
|
65
66
|
# expected is a Float!!!
|
|
66
67
|
def actual_and_expected_number(peptide_aaseqs, amino_acid=:C, at_least=1)
|
|
68
|
+
if at_least > 1
|
|
69
|
+
raise NotImplementedError, "can only do at_least=1 right now!"
|
|
70
|
+
end
|
|
67
71
|
one_minus_freq = 1.0 - @aafreqs[amino_acid.to_sym]
|
|
68
72
|
amino_acid_as_st = amino_acid.to_s
|
|
69
73
|
probs = []
|