mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
data/lib/spec_id/srf.rb
CHANGED
|
@@ -1,7 +1,12 @@
|
|
|
1
|
+
require 'spec_id'
|
|
1
2
|
require 'spec_id/sequest'
|
|
3
|
+
require 'fasta'
|
|
4
|
+
require 'mspire'
|
|
5
|
+
require 'set'
|
|
6
|
+
require 'fasta'
|
|
2
7
|
|
|
3
8
|
module BinaryReader
|
|
4
|
-
Null_char = "\0"[0] ## change for ruby 1.9 or 2.0
|
|
9
|
+
Null_char = "\0"[0] ## TODO: change for ruby 1.9 or 2.0
|
|
5
10
|
# extracts a string with all empty chars at the end stripped
|
|
6
11
|
# expects the filehandle to be at the proper location
|
|
7
12
|
def get_null_padded_string(fh,bytes)
|
|
@@ -16,62 +21,6 @@ module BinaryReader
|
|
|
16
21
|
end
|
|
17
22
|
|
|
18
23
|
# class to extract information from <file>_dta.log files
|
|
19
|
-
class DTALog
|
|
20
|
-
# returns an array indexed by the dta file number (starting at 0)
|
|
21
|
-
# each entry is an array [first_scan, last_scan, dta_filename_noext]
|
|
22
|
-
# this is now obsolete since I found the scan # index at the end of the srf
|
|
23
|
-
# files
|
|
24
|
-
def self.dta_and_scans_by_dta_index(file)
|
|
25
|
-
dta_index = nil
|
|
26
|
-
final_scan = nil
|
|
27
|
-
dta_cnt = 0
|
|
28
|
-
re = /^ m/o
|
|
29
|
-
scan_line_re = /scan: (\d+) - (\d+), Datafile: (.*?) (.*)/o
|
|
30
|
-
other_dta_re = /Datafile: (.*?) /o
|
|
31
|
-
File.open(file) do |fh|
|
|
32
|
-
10.times { fh.readline }
|
|
33
|
-
scan_range_line = fh.readline
|
|
34
|
-
if scan_range_line =~ /scan range\s+= \d+ - (\d+)/
|
|
35
|
-
# this is an overestimate (since MS scans have no dta, but that's OK)
|
|
36
|
-
dta_index = Array.new($1.to_i)
|
|
37
|
-
else
|
|
38
|
-
dta_index = []
|
|
39
|
-
end
|
|
40
|
-
3.times { fh.readline }
|
|
41
|
-
fh.each do |line|
|
|
42
|
-
if line =~ re
|
|
43
|
-
if line =~ scan_line_re
|
|
44
|
-
first_scan = $1.to_i
|
|
45
|
-
last_scan = $2.to_i
|
|
46
|
-
the_rest = $4.dup
|
|
47
|
-
dta_index[dta_cnt] = [first_scan, last_scan, $3.sub(/\.dta/,'')]
|
|
48
|
-
dta_cnt += 1
|
|
49
|
-
if the_rest =~ other_dta_re
|
|
50
|
-
dta_index[dta_cnt] = [first_scan, last_scan, $1.sub(/\.dta/,'')]
|
|
51
|
-
dta_cnt += 1
|
|
52
|
-
end
|
|
53
|
-
end
|
|
54
|
-
break
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
fh.each do |line|
|
|
58
|
-
if line =~ scan_line_re
|
|
59
|
-
first_scan = $1.to_i
|
|
60
|
-
last_scan = $2.to_i
|
|
61
|
-
the_rest = $4.dup
|
|
62
|
-
dta_index[dta_cnt] = [first_scan, last_scan, $3.sub(/\.dta/,'')]
|
|
63
|
-
dta_cnt += 1
|
|
64
|
-
if the_rest =~ other_dta_re
|
|
65
|
-
dta_index[dta_cnt] = [first_scan, last_scan, $1.sub(/\.dta/,'')]
|
|
66
|
-
dta_cnt += 1
|
|
67
|
-
end
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
|
-
end
|
|
71
|
-
dta_index.compact! # remove those trailing nils
|
|
72
|
-
dta_index
|
|
73
|
-
end
|
|
74
|
-
end
|
|
75
24
|
|
|
76
25
|
class SRFGroup
|
|
77
26
|
include SpecID
|
|
@@ -83,12 +32,15 @@ class SRFGroup
|
|
|
83
32
|
# takes an array of filenames
|
|
84
33
|
# or a single .srg filename
|
|
85
34
|
# see from_srg to load a single .srg file
|
|
86
|
-
|
|
35
|
+
# by default, the hits will be returned filtered by sequest params values.
|
|
36
|
+
# [The raw SRF data is unfiltered!]
|
|
37
|
+
def initialize(filenames=nil, filter_hits_by_params=true)
|
|
87
38
|
@filenames = filenames
|
|
88
39
|
@peps = []
|
|
89
40
|
@prots = []
|
|
90
|
-
@global_ref_hash = {}
|
|
91
41
|
@srfs = []
|
|
42
|
+
|
|
43
|
+
global_ref_hash = {}
|
|
92
44
|
if filenames
|
|
93
45
|
if filenames.is_a?(String) && filenames =~ /\.srg$/
|
|
94
46
|
srg_filename = filenames.dup
|
|
@@ -103,16 +55,99 @@ class SRFGroup
|
|
|
103
55
|
end
|
|
104
56
|
end
|
|
105
57
|
filenames.each do |file|
|
|
106
|
-
@srfs << SRF.new(file, @peps,
|
|
58
|
+
@srfs << SRF.new(file, @peps, global_ref_hash)
|
|
59
|
+
end
|
|
60
|
+
@prots = global_ref_hash.values
|
|
61
|
+
if filter_hits_by_params
|
|
62
|
+
filter_by_peptide_mass_tolerance
|
|
63
|
+
end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
# if srfs were read in separately, then the proteins will need to be merged
|
|
68
|
+
# by their reference
|
|
69
|
+
def merge_different_sets(srfs)
|
|
70
|
+
raise NotImplementedError, "need to implement?"
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# 1. sets @prots and returns it: a new list of proteins based on which
|
|
74
|
+
# peptides passed.
|
|
75
|
+
# 2. updates the out_file's list of hits based on passing peptides (but not
|
|
76
|
+
# the original hit id; rank is implicit in array ordering)
|
|
77
|
+
# 3. updates each protein to only include peptides passing thresholds.
|
|
78
|
+
# [Note, this process is how .out files are generated!]
|
|
79
|
+
# 4. recalculates deltacn values completely if number of hits changed (does
|
|
80
|
+
# not touch deltacn orig)
|
|
81
|
+
# ASSUMES:
|
|
82
|
+
# A. all srfs have identical params objects and each has a
|
|
83
|
+
# peptide_mass_tolerance filter attribute.
|
|
84
|
+
# B. proteins are already unique (peptides referencing the same protein
|
|
85
|
+
# reference the same object already) In practice, this means all srfs were
|
|
86
|
+
# read in together.
|
|
87
|
+
def filter_by_peptide_mass_tolerance
|
|
88
|
+
prots_in_set = Set.new
|
|
89
|
+
params = @srfs.first.params
|
|
90
|
+
pmt = params.peptide_mass_tolerance.to_f
|
|
91
|
+
methd = nil # the method to
|
|
92
|
+
|
|
93
|
+
case params.peptide_mass_units
|
|
94
|
+
when '0'
|
|
95
|
+
amu_based = true
|
|
96
|
+
milli_amu = false
|
|
97
|
+
when '1'
|
|
98
|
+
amu_based = true
|
|
99
|
+
milli_amu = true
|
|
100
|
+
when '2'
|
|
101
|
+
amu_based = false
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
@srfs.each do |srf|
|
|
105
|
+
srf.out_files.each do |out_file|
|
|
106
|
+
hits = out_file.hits
|
|
107
|
+
before = hits.size
|
|
108
|
+
hits.reject! do |pep|
|
|
109
|
+
do_not_keep =
|
|
110
|
+
if amu_based
|
|
111
|
+
if milli_amu
|
|
112
|
+
(pep.deltamass.abs > (pmt/1000))
|
|
113
|
+
else
|
|
114
|
+
(pep.deltamass.abs > pmt)
|
|
115
|
+
end
|
|
116
|
+
else
|
|
117
|
+
(pep.ppm.abs > pmt)
|
|
118
|
+
end
|
|
119
|
+
unless do_not_keep
|
|
120
|
+
pep.prots.each do |prot|
|
|
121
|
+
if prots_in_set.include?(prot)
|
|
122
|
+
prot.peps << pep
|
|
123
|
+
else
|
|
124
|
+
prots_in_set.add(prot)
|
|
125
|
+
prot.peps = [pep]
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
end
|
|
129
|
+
do_not_keep
|
|
130
|
+
end
|
|
131
|
+
if hits.size != before
|
|
132
|
+
SRF::OUT::Pep.set_deltacn_from_xcorr(hits)
|
|
133
|
+
end
|
|
107
134
|
end
|
|
108
135
|
end
|
|
136
|
+
@prots = prots_in_set.to_a
|
|
137
|
+
|
|
109
138
|
end
|
|
110
139
|
|
|
111
140
|
# returns the filename used
|
|
141
|
+
# if the file exists, the name will be expanded to full path, otherwise just
|
|
142
|
+
# what is given
|
|
112
143
|
def to_srg(srg_filename='bioworks.srg')
|
|
113
144
|
File.open(srg_filename, 'w') do |v|
|
|
114
145
|
@filenames.each do |srf_file|
|
|
115
|
-
|
|
146
|
+
if File.exist? srf_file
|
|
147
|
+
v.puts File.expand_path(srf_file)
|
|
148
|
+
else
|
|
149
|
+
v.puts srf_file
|
|
150
|
+
end
|
|
116
151
|
end
|
|
117
152
|
end
|
|
118
153
|
srg_filename
|
|
@@ -132,15 +167,25 @@ class SRF
|
|
|
132
167
|
# [first_scan, last_scan, charge]
|
|
133
168
|
attr_accessor :index
|
|
134
169
|
attr_accessor :base_name
|
|
170
|
+
# this is the global peptides array
|
|
171
|
+
attr_accessor :peps
|
|
172
|
+
# the global reference hash that allows...
|
|
173
|
+
attr_accessor :global_ref_hash
|
|
135
174
|
|
|
136
175
|
def dta_start_byte
|
|
137
176
|
case @version
|
|
138
177
|
when '3.2' ; 3260
|
|
139
178
|
when '3.3' ; 3644
|
|
179
|
+
when '3.5' ; 3644
|
|
140
180
|
end
|
|
141
181
|
end
|
|
142
182
|
|
|
143
|
-
# peps and
|
|
183
|
+
# peps and global_ref_hash are created as the srf files is read. If the
|
|
184
|
+
# file is read as part of a group, then these should be passed in.
|
|
185
|
+
# NOTE: if you want the hits filtered by precursor tolerance (the way they
|
|
186
|
+
# might be displayed in .out files) you should probably use SRFGroup (which
|
|
187
|
+
# does this by default)
|
|
188
|
+
# SRF is meant to be a low level read of the file.
|
|
144
189
|
def initialize(filename=nil, peps=[], global_ref_hash={})
|
|
145
190
|
@dta_files = []
|
|
146
191
|
@out_files = []
|
|
@@ -149,17 +194,203 @@ class SRF
|
|
|
149
194
|
end
|
|
150
195
|
end
|
|
151
196
|
|
|
197
|
+
def round(float, decimal_places)
|
|
198
|
+
sprintf("%.#{decimal_places}f", float)
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
# the out_filename will be the base_name + .sqt unless 'out_filename' is
|
|
202
|
+
# defined
|
|
203
|
+
# :round => round floating point numbers
|
|
204
|
+
# etc...
|
|
205
|
+
def to_sqt(out_filename=nil, opts={})
|
|
206
|
+
tic_dp = 2
|
|
207
|
+
mh_dp = 7
|
|
208
|
+
xcorr_dp = 5
|
|
209
|
+
sp_dp = 2
|
|
210
|
+
dcn_dp = 5
|
|
211
|
+
|
|
212
|
+
defaults = {:db_info=>false, :new_db_path=>nil, :update_db_path=>false, :round=>false}
|
|
213
|
+
opt = defaults.merge(opts)
|
|
214
|
+
|
|
215
|
+
outfile =
|
|
216
|
+
if out_filename
|
|
217
|
+
out_filename
|
|
218
|
+
else
|
|
219
|
+
base_name + '.sqt'
|
|
220
|
+
end
|
|
221
|
+
invariant_ordering = %w(SQTGenerator SQTGeneratorVersion Database FragmentMasses PrecursorMasses StartTime) # just for readability and consistency
|
|
222
|
+
fmt =
|
|
223
|
+
if params.fragment_mass_type == 'average' ; 'AVG'
|
|
224
|
+
else ; 'MONO'
|
|
225
|
+
end
|
|
226
|
+
pmt =
|
|
227
|
+
if params.precursor_mass_type == 'average' ; 'AVG'
|
|
228
|
+
else ; 'MONO'
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
mass_table = params.mass_table
|
|
232
|
+
static_mods = params.static_mods.map do |k,v|
|
|
233
|
+
key = k.split(/_/)[1]
|
|
234
|
+
if key.size == 1
|
|
235
|
+
key + '=' + (mass_table[key.to_sym] + v.to_f).to_s
|
|
236
|
+
else
|
|
237
|
+
key + '=' + v
|
|
238
|
+
end
|
|
239
|
+
end
|
|
240
|
+
|
|
241
|
+
dynamic_mods = []
|
|
242
|
+
header.modifications.scan(/\((.*?)\)/) do |match|
|
|
243
|
+
dynamic_mods << match.first.sub(/ /,'=')
|
|
244
|
+
end
|
|
245
|
+
plural = {
|
|
246
|
+
'StaticMod' => static_mods,
|
|
247
|
+
'DynamicMod' => dynamic_mods, # example as diff mod
|
|
248
|
+
'Comment' => ['Created from Bioworks .srf file']
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
db_filename = header.db_filename
|
|
253
|
+
db_filename_in_sqt = db_filename
|
|
254
|
+
if opt[:new_db_path]
|
|
255
|
+
db_filename = File.join(opt[:new_db_path], File.basename(db_filename.gsub('\\', '/')))
|
|
256
|
+
if opt[:update_db_path]
|
|
257
|
+
db_filename_in_sqt = File.expand_path(db_filename)
|
|
258
|
+
warn "writing Database #{db_filename} to sqt, but it does not exist on this file system" unless File.exist?(db_filename)
|
|
259
|
+
end
|
|
260
|
+
end
|
|
261
|
+
|
|
262
|
+
apmu =
|
|
263
|
+
case params.peptide_mass_units
|
|
264
|
+
when '0' : 'amu'
|
|
265
|
+
when '1' : 'mmu'
|
|
266
|
+
when '2' : 'ppm'
|
|
267
|
+
end
|
|
268
|
+
|
|
269
|
+
hh = {
|
|
270
|
+
'SQTGenerator' => 'mspire',
|
|
271
|
+
'SQTGeneratorVersion' => Mspire::Version,
|
|
272
|
+
'Database' => db_filename_in_sqt,
|
|
273
|
+
'FragmentMasses' => fmt,
|
|
274
|
+
'PrecursorMasses' => pmt,
|
|
275
|
+
'StartTime' => '', # Bioworks 3.2 also leaves this blank...
|
|
276
|
+
'Alg-PreMassTol' => params.peptide_mass_tolerance,
|
|
277
|
+
'Alg-FragMassTol' => params.fragment_ion_tolerance,
|
|
278
|
+
'Alg-PreMassUnits' => apmu, ## mine
|
|
279
|
+
'Alg-IonSeries' => header.ion_series.split(':').last.lstrip,
|
|
280
|
+
'Alg-Enzyme' => header.enzyme.split(':').last,
|
|
281
|
+
'Alg-MSModel' => header.model,
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
if opt[:db_info]
|
|
285
|
+
if File.exist?(db_filename)
|
|
286
|
+
reply = get_db_info_for_sqt(db_filename)
|
|
287
|
+
%w(DBSeqLength DBLocusCount DBMD5Sum).zip(reply) do |label,val|
|
|
288
|
+
hh[label] = val
|
|
289
|
+
end
|
|
290
|
+
else
|
|
291
|
+
warn "file #{db_filename} does not exist, no extra db info in header!"
|
|
292
|
+
end
|
|
293
|
+
end
|
|
294
|
+
|
|
295
|
+
has_hits = (self.out_files.size > 0)
|
|
296
|
+
if has_hits
|
|
297
|
+
# somewhat redundant with above, but we can get this without a db present!
|
|
298
|
+
hh['DBLocusCount'] = self.out_files.first.db_locus_count
|
|
299
|
+
end
|
|
300
|
+
|
|
301
|
+
File.open(outfile, 'w') do |out|
|
|
302
|
+
# print the header:
|
|
303
|
+
invariant_ordering.each do |iv|
|
|
304
|
+
out.puts ['H', iv, hh.delete(iv)].join("\t")
|
|
305
|
+
end
|
|
306
|
+
hh.each do |k,v|
|
|
307
|
+
out.puts ['H', k, v].join("\t")
|
|
308
|
+
end
|
|
309
|
+
plural.each do |k,vals|
|
|
310
|
+
vals.each do |val|
|
|
311
|
+
out.puts ['H', k, val].join("\t")
|
|
312
|
+
end
|
|
313
|
+
end
|
|
314
|
+
|
|
315
|
+
##### SPECTRA
|
|
316
|
+
time_to_process = '0.0'
|
|
317
|
+
#########################################
|
|
318
|
+
# NEED TO FIGURE OUT: (in spectra guy)
|
|
319
|
+
# * Lowest Sp value for top 500 spectra
|
|
320
|
+
# * Number of sequences matching this precursor ion
|
|
321
|
+
#########################################
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
manual_validation_status = 'U'
|
|
325
|
+
self.out_files.zip(dta_files) do |out_file, dta_file|
|
|
326
|
+
# don't have the time to process (using 0.0 like bioworks 3.2)
|
|
327
|
+
dta_file_mh = dta_file.mh
|
|
328
|
+
out_file_total_inten = out_file.total_inten
|
|
329
|
+
out_file_lowest_sp = out_file.lowest_sp
|
|
330
|
+
if opt[:round]
|
|
331
|
+
dta_file_mh = round(dta_file_mh, mh_dp)
|
|
332
|
+
out_file_total_inten = round(out_file_total_inten, tic_dp)
|
|
333
|
+
out_file_lowest_sp = round(out_file_lowest_sp, sp_dp)
|
|
334
|
+
end
|
|
335
|
+
|
|
336
|
+
out.puts ['S', out_file.first_scan, out_file.last_scan, out_file.charge, time_to_process, out_file.computer, dta_file_mh, out_file_total_inten, out_file_lowest_sp, out_file.num_matched_peptides].join("\t")
|
|
337
|
+
out_file.hits.each_with_index do |hit,index|
|
|
338
|
+
hit_mh = hit.mh
|
|
339
|
+
hit_deltacn_orig = hit.deltacn_orig
|
|
340
|
+
hit_xcorr = hit.xcorr
|
|
341
|
+
hit_sp = hit.sp
|
|
342
|
+
if opt[:round]
|
|
343
|
+
hit_mh = round(hit_mh, mh_dp)
|
|
344
|
+
hit_deltacn_orig = round(hit_deltacn_orig, dcn_dp)
|
|
345
|
+
hit_xcorr = round(hit_xcorr, xcorr_dp)
|
|
346
|
+
hit_sp = round(hit_sp, sp_dp)
|
|
347
|
+
end
|
|
348
|
+
# note that the rank is determined by the order..
|
|
349
|
+
out.puts ['M', index+1, hit.rsp, hit_mh, hit_deltacn_orig, hit_xcorr, hit_sp, hit.ions_matched, hit.ions_total, hit.sequence, manual_validation_status].join("\t")
|
|
350
|
+
hit.prots.each do |prot|
|
|
351
|
+
out.puts ['L', prot.first_entry].join("\t")
|
|
352
|
+
end
|
|
353
|
+
end
|
|
354
|
+
end
|
|
355
|
+
end # close the filehandle
|
|
356
|
+
|
|
357
|
+
end
|
|
358
|
+
|
|
359
|
+
# assumes the file exists and is readable
|
|
360
|
+
# returns [DBSeqLength, DBLocusCount, DBMD5Sum] or nil if no file
|
|
361
|
+
def get_db_info_for_sqt(dbfile)
|
|
362
|
+
fasta = Fasta.new(dbfile)
|
|
363
|
+
[fasta.aa_seq_length, fasta.size, fasta.md5_sum]
|
|
364
|
+
end
|
|
365
|
+
|
|
366
|
+
|
|
152
367
|
# returns self
|
|
153
368
|
def from_file(filename, peps, global_ref_hash)
|
|
154
369
|
|
|
155
370
|
File.open(filename, "rb") do |fh|
|
|
156
371
|
@header = SRF::Header.new.from_handle(fh)
|
|
157
372
|
@version = @header.version
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
373
|
+
unpack_35 = case @version
|
|
374
|
+
when '3.2'
|
|
375
|
+
false
|
|
376
|
+
when '3.3'
|
|
377
|
+
false
|
|
378
|
+
when '3.5'
|
|
379
|
+
true
|
|
380
|
+
end
|
|
381
|
+
@dta_files, measured_mhs = read_dta_files(fh,@header.num_dta_files, unpack_35)
|
|
382
|
+
|
|
383
|
+
@out_files = read_out_files(fh,@header.num_dta_files, global_ref_hash, measured_mhs, unpack_35)
|
|
384
|
+
if fh.eof?
|
|
385
|
+
warn "FILE: '#{filename}' appears to be an abortive run (no params in srf file)\nstill continuing..."
|
|
386
|
+
@params = nil
|
|
387
|
+
@index = []
|
|
388
|
+
else
|
|
389
|
+
@params = Sequest::Params.new.parse_handle(fh)
|
|
390
|
+
# This is very sensitive to the grab_params method in sequest params
|
|
391
|
+
fh.read(12) ## gap between last params entry and index
|
|
392
|
+
@index = read_scan_index(fh,@header.num_dta_files)
|
|
393
|
+
end
|
|
163
394
|
end
|
|
164
395
|
|
|
165
396
|
### UPDATE SOME THINGS ON SINGLE PASS:
|
|
@@ -168,14 +399,15 @@ class SRF
|
|
|
168
399
|
@index.each_with_index do |ind,i|
|
|
169
400
|
mass_measured = @dta_files[i][0]
|
|
170
401
|
#puts @out_files[i].join(", ")
|
|
171
|
-
|
|
402
|
+
@out_files[i][0,3] = *ind
|
|
403
|
+
pep_hits = @out_files[i][6]
|
|
172
404
|
peps.push( *pep_hits )
|
|
173
405
|
pep_hits.each do |pep_hit|
|
|
174
|
-
pep_hit[
|
|
406
|
+
pep_hit[14,4] = @base_name, *ind
|
|
175
407
|
# add the deltamass
|
|
176
|
-
pep_hit[
|
|
177
|
-
pep_hit[
|
|
178
|
-
pep_hit[
|
|
408
|
+
pep_hit[11] = pep_hit[0] - mass_measured # real - measured (deltamass)
|
|
409
|
+
pep_hit[12] = 1.0e6 * pep_hit[11].abs / mass_measured ## ppm
|
|
410
|
+
pep_hit[18] = self ## link with the srf object
|
|
179
411
|
end
|
|
180
412
|
end
|
|
181
413
|
self
|
|
@@ -195,27 +427,17 @@ class SRF
|
|
|
195
427
|
index
|
|
196
428
|
end
|
|
197
429
|
|
|
198
|
-
# given a zero indexed list where each entry is [first_scan, last_scan,
|
|
199
|
-
# dta_filename] updates the out info
|
|
200
|
-
# returns self
|
|
201
|
-
def update_out_scan_info_from_dta_log(dta_log)
|
|
202
|
-
index = DTALog.dta_and_scans_by_dta_index(dta_log)
|
|
203
|
-
@out_files.each_with_index do |ot,i|
|
|
204
|
-
ot[4,3] = index[i] #contingent on implementation of ot
|
|
205
|
-
end
|
|
206
|
-
self
|
|
207
|
-
end
|
|
208
|
-
|
|
209
430
|
# returns an array of dta_files
|
|
210
|
-
def read_dta_files(fh, num_files)
|
|
431
|
+
def read_dta_files(fh, num_files, unpack_35)
|
|
211
432
|
measured_mhs = Array.new(num_files) ## A parallel array to capture the actual mh
|
|
212
433
|
dta_files = Array.new(num_files)
|
|
213
434
|
start = dta_start_byte
|
|
214
435
|
unless fh.pos == start
|
|
215
436
|
fh.pos = start
|
|
216
437
|
end
|
|
438
|
+
|
|
217
439
|
header.num_dta_files.times do |i|
|
|
218
|
-
dta_file = SRF::DTA.new.from_handle(fh)
|
|
440
|
+
dta_file = SRF::DTA.new.from_handle(fh, unpack_35)
|
|
219
441
|
measured_mhs[i] = dta_file[0]
|
|
220
442
|
dta_files[i] = dta_file
|
|
221
443
|
end
|
|
@@ -224,10 +446,10 @@ class SRF
|
|
|
224
446
|
|
|
225
447
|
# filehandle (fh) must be at the start of the outfiles. 'read_dta_files'
|
|
226
448
|
# will put the fh there.
|
|
227
|
-
def read_out_files(fh,number_files, global_ref_hash, measured_mhs)
|
|
449
|
+
def read_out_files(fh,number_files, global_ref_hash, measured_mhs, unpack_35)
|
|
228
450
|
out_files = Array.new(number_files)
|
|
229
451
|
header.num_dta_files.times do |i|
|
|
230
|
-
out_files[i] = SRF::OUT.new.from_handle(fh, global_ref_hash)
|
|
452
|
+
out_files[i] = SRF::OUT.new.from_handle(fh, global_ref_hash, unpack_35)
|
|
231
453
|
end
|
|
232
454
|
out_files
|
|
233
455
|
end
|
|
@@ -326,31 +548,44 @@ class SRF::DTAGen
|
|
|
326
548
|
end
|
|
327
549
|
end
|
|
328
550
|
|
|
551
|
+
# total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
|
|
552
|
+
# unknown is, well unknown...
|
|
329
553
|
SRF::DTA = ArrayClass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))
|
|
330
554
|
|
|
331
555
|
class SRF::DTA
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
556
|
+
# original
|
|
557
|
+
# Unpack = "EeIvvvv"
|
|
558
|
+
Unpack_32 = "EeIvvvv"
|
|
559
|
+
Unpack_35 = "Ex8eVx2vvvv"
|
|
336
560
|
|
|
337
561
|
# note on peaks (self[7])
|
|
338
562
|
# this is a byte array of floats, you can get the peaks out with
|
|
339
563
|
# unpack("e*")
|
|
340
564
|
|
|
565
|
+
undef_method :inspect
|
|
341
566
|
def inspect
|
|
342
567
|
peaks_st = 'nil'
|
|
343
568
|
if self[7] ; peaks_st = "[#{self[7].size} bytes]" end
|
|
344
569
|
"<SRF::DTA @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
|
|
345
570
|
end
|
|
346
571
|
|
|
347
|
-
def from_handle(fh)
|
|
348
|
-
|
|
572
|
+
def from_handle(fh, unpack_35)
|
|
573
|
+
if unpack_35
|
|
574
|
+
@unpack = Unpack_35
|
|
575
|
+
@read_header = 34
|
|
576
|
+
@read_spacer = 22
|
|
577
|
+
else
|
|
578
|
+
@unpack = Unpack_32
|
|
579
|
+
@read_header = 24
|
|
580
|
+
@read_spacer = 24
|
|
581
|
+
end
|
|
582
|
+
|
|
583
|
+
st = fh.read(@read_header)
|
|
349
584
|
# get the bulk of the data in single unpack
|
|
350
|
-
self[0,7] = st.unpack(
|
|
585
|
+
self[0,7] = st.unpack(@unpack)
|
|
351
586
|
|
|
352
|
-
# Scan numbers
|
|
353
|
-
st2 = fh.read(
|
|
587
|
+
# Scan numbers are given at the end in an index!
|
|
588
|
+
st2 = fh.read(@read_spacer)
|
|
354
589
|
|
|
355
590
|
num_bytes_to_read = num_peaks * 8
|
|
356
591
|
st3 = fh.read(num_bytes_to_read)
|
|
@@ -360,135 +595,176 @@ class SRF::DTA
|
|
|
360
595
|
|
|
361
596
|
end
|
|
362
597
|
|
|
363
|
-
SRF::OUT =
|
|
364
|
-
# 0=
|
|
598
|
+
SRF::OUT = ArrayClass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
|
|
599
|
+
# 0=first_scan, 1=last_scan, 2=charge, 3=num_hits, 4=computer, 5=date_time, 6=hits, 7=total_inten, 8=lowest_sp, 9=num_matched_peptides, 10=db_locus_count
|
|
365
600
|
|
|
366
601
|
class SRF::OUT
|
|
367
|
-
|
|
602
|
+
Unpack_32 = '@36vx2Z*@60Z*'
|
|
603
|
+
Unpack_35 = '@36vx4Z*@62Z*'
|
|
368
604
|
|
|
605
|
+
undef_method :inspect
|
|
369
606
|
def inspect
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
607
|
+
hits_s =
|
|
608
|
+
if self[6]
|
|
609
|
+
", @hits(#)=#{hits.size}"
|
|
610
|
+
else
|
|
611
|
+
''
|
|
612
|
+
end
|
|
613
|
+
"<SRF::OUT first_scan=#{first_scan}, last_scan=#{last_scan}, charge=#{charge}, num_hits=#{num_hits}, computer=#{computer}, date_time=#{date_time}#{hits_s}>"
|
|
374
614
|
end
|
|
375
615
|
|
|
376
|
-
def from_handle(fh, global_ref_hash)
|
|
616
|
+
def from_handle(fh, global_ref_hash, unpack_35)
|
|
377
617
|
## EMPTY out file is 96 bytes
|
|
378
618
|
## each hit is 320 bytes
|
|
379
619
|
## num_hits and charge:
|
|
380
620
|
st = fh.read(96)
|
|
381
|
-
|
|
382
|
-
|
|
621
|
+
|
|
622
|
+
self[3,3] = st.unpack( (unpack_35 ? Unpack_35 : Unpack_32) )
|
|
623
|
+
self[7,4] = st.unpack('@8eex4Ix4I')
|
|
624
|
+
num_hits = self[3]
|
|
383
625
|
|
|
384
626
|
ar = Array.new(num_hits)
|
|
385
627
|
if ar.size > 0
|
|
628
|
+
num_extra_references = 0
|
|
386
629
|
num_hits.times do |i|
|
|
387
|
-
ar[i] = SRF::OUT::Pep.new.from_handle(fh, global_ref_hash)
|
|
630
|
+
ar[i] = SRF::OUT::Pep.new.from_handle(fh, global_ref_hash, unpack_35)
|
|
631
|
+
num_extra_references += ar[i].num_other_loci
|
|
388
632
|
end
|
|
633
|
+
SRF::OUT::Pep.read_extra_references(fh, num_extra_references, ar, global_ref_hash)
|
|
389
634
|
## The xcorrs are already ordered by best to worst hit
|
|
390
635
|
## ADJUST the deltacn's to be meaningful for the top hit:
|
|
391
636
|
## (the same as bioworks and prophet)
|
|
392
|
-
(
|
|
393
|
-
ar.
|
|
637
|
+
SRF::OUT::Pep.set_deltacn_from_deltacn_orig(ar)
|
|
638
|
+
#puts ar.map {|a| a.deltacn }.join(", ")
|
|
394
639
|
end
|
|
395
|
-
self[
|
|
396
|
-
|
|
640
|
+
self[6] = ar
|
|
397
641
|
self
|
|
398
642
|
end
|
|
399
643
|
|
|
644
|
+
|
|
645
|
+
|
|
400
646
|
end
|
|
401
647
|
|
|
648
|
+
|
|
402
649
|
# deltacn is modified to be that of the next best hit (by xcorr).
|
|
650
|
+
# deltacn_orig is the one that sequest originally reports
|
|
403
651
|
# if there is no next best hit, then it will be 1.1 (like bioworks)
|
|
404
652
|
# mh is the theoretical mass + h
|
|
405
653
|
# prots are created as SRF prot objects with a reference and linked to their
|
|
406
654
|
# peptides (from global hash by reference)
|
|
407
655
|
# ppm = 10^6 * ∆m_accuracy / mass_measured [ where ∆m_accuracy = mass_real – mass_measured ]
|
|
408
656
|
# This is calculated for the M+H mass!
|
|
657
|
+
# num_other_loci is the number of other loci that the peptide matches beyond
|
|
658
|
+
# the first one listed
|
|
409
659
|
# srf = the srf object this scan came from
|
|
410
|
-
SRF::OUT::Pep = ArrayClass.new(%w( mh deltacn sp xcorr id rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf) )
|
|
411
660
|
|
|
412
|
-
|
|
661
|
+
SRF::OUT::Pep = ArrayClass.new(%w( mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn) )
|
|
662
|
+
|
|
663
|
+
# 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn
|
|
413
664
|
|
|
414
665
|
class SRF::OUT::Pep
|
|
415
666
|
include SpecID::Pep
|
|
416
667
|
|
|
417
|
-
|
|
668
|
+
# creates the deltacn that is meaningful for the top hit (the deltacn_orig
|
|
669
|
+
# of the second best hit and so on).
|
|
670
|
+
# assumes sorted
|
|
671
|
+
def self.set_deltacn_from_deltacn_orig(ar)
|
|
672
|
+
(1...ar.size).each {|i| ar[i-1].deltacn = ar[i].deltacn_orig }
|
|
673
|
+
ar[-1].deltacn = 1.1
|
|
674
|
+
end
|
|
675
|
+
|
|
676
|
+
# same as set_deltacn_from_deltacn_orig except calculates with xcorr.
|
|
677
|
+
# assumes sorted
|
|
678
|
+
def self.set_deltacn_from_xcorr(ar)
|
|
679
|
+
if ar.size > 0
|
|
680
|
+
top_score = ar.first[3]
|
|
681
|
+
other_scores = (1...(ar.size)).to_a.map do |i|
|
|
682
|
+
(top_score - ar[i][3])/top_score
|
|
683
|
+
end
|
|
684
|
+
(0...(ar.size-1)).each do |i|
|
|
685
|
+
ar[i][19] = other_scores[i]
|
|
686
|
+
end
|
|
687
|
+
ar.last[19] = 1.1
|
|
688
|
+
end
|
|
689
|
+
end
|
|
690
|
+
|
|
691
|
+
def self.read_extra_references(fh, num_extra_references, pep_hits, global_ref_hash)
|
|
692
|
+
num_extra_references.times do
|
|
693
|
+
# 80 bytes total (with index number)
|
|
694
|
+
pep = pep_hits[fh.read(8).unpack('x4I').first - 1]
|
|
695
|
+
|
|
696
|
+
ref = fh.read(80).unpack('A*').first
|
|
697
|
+
pep[10] << pep.new_protein(ref[0,38], pep, global_ref_hash)
|
|
698
|
+
end
|
|
699
|
+
# fh.read(6) if unpack_35
|
|
700
|
+
end
|
|
701
|
+
|
|
702
|
+
# x2=???
|
|
703
|
+
#Unpack_35 = '@64Ex8ex12eeIx22vx2vvx8Z*@246Z*'
|
|
704
|
+
### NOTE:
|
|
705
|
+
# I need to verify that this is correct (I mean the 'I' after x18)
|
|
706
|
+
Unpack_35 = '@64Ex8ex12eeIx18Ivx2vvx8Z*@246Z*'
|
|
707
|
+
# translation: @64=(64 bytes in to the record), E=mH, x8=8unknown bytes, e=deltacn,
|
|
708
|
+
# x12=12unknown bytes, e=sp, e=xcorr, I=ID#, x18=18 unknown bytes, v=rsp,
|
|
709
|
+
# v=ions_matched, v=ions_total, x8=8unknown bytes, Z*=sequence, @240Z*=at
|
|
710
|
+
# byte 240 grab the string (which is proteins).
|
|
711
|
+
#Unpack_32 = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
|
|
712
|
+
Unpack_32 = '@64Ex8ex12eeIx14Ivvvx8Z*@240Z*'
|
|
418
713
|
Unpack_four_null_bytes = 'a*'
|
|
419
714
|
Unpack_Zstar = 'Z*'
|
|
715
|
+
Read_35 = 426
|
|
716
|
+
Read_32 = 320
|
|
420
717
|
|
|
421
718
|
FourNullBytes_as_string = "\0\0\0\0"
|
|
422
719
|
#NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
|
|
423
720
|
NewRecordStart = 0x01.chr + 0x00.chr
|
|
424
721
|
Sequest_record_start = "[SEQUEST]"
|
|
425
722
|
|
|
426
|
-
|
|
427
|
-
def prots() self[9] end
|
|
428
|
-
$VERBOSE = tmp
|
|
429
|
-
|
|
723
|
+
undef_method :inspect
|
|
430
724
|
def inspect
|
|
431
|
-
st = %w(aaseq sequence mh
|
|
432
|
-
if v
|
|
725
|
+
st = %w(aaseq sequence mh deltacn_orig sp xcorr id rsp ions_matched ions_total prots deltamass ppm base_name first_scan last_scan charge deltacn).map do |v|
|
|
726
|
+
if v == 'prots'
|
|
727
|
+
"#{v}(#)=#{send(v.to_sym).size}"
|
|
728
|
+
elsif v.is_a? Array
|
|
433
729
|
"##{v}=#{send(v.to_sym).size}"
|
|
434
730
|
else
|
|
435
|
-
"
|
|
731
|
+
"#{v}=#{send(v.to_sym).inspect}"
|
|
436
732
|
end
|
|
437
733
|
end
|
|
438
734
|
st.unshift("<#{self.class}")
|
|
439
735
|
if srf
|
|
440
|
-
st.push("
|
|
736
|
+
st.push("srf(base_name)=#{srf.base_name.inspect}")
|
|
441
737
|
end
|
|
442
738
|
st.push('>')
|
|
443
739
|
st.join(' ')
|
|
444
740
|
#"<SRF::OUT::Pep @mh=#{mh}, @deltacn=#{deltacn}, @sp=#{sp}, @xcorr=#{xcorr}, @id=#{id}, @rsp=#{rsp}, @ions_matched=#{ions_matched}, @ions_total=#{ions_total}, @sequence=#{sequence}, @prots(count)=#{prots.size}, @deltamass=#{deltamass}, @ppm=#{ppm} @aaseq=#{aaseq}, @base_name=#{base_name}, @first_scan=#{first_scan}, @last_scan=#{last_scan}, @charge=#{charge}, @srf(base_name)=#{srf.base_name}>"
|
|
445
741
|
end
|
|
742
|
+
# extra_references_array is an array that grows with peptides as extra
|
|
743
|
+
# references are discovered.
|
|
744
|
+
def from_handle(fh, global_ref_hash, unpack_35)
|
|
745
|
+
unpack =
|
|
746
|
+
if unpack_35 ; Unpack_35
|
|
747
|
+
else ; Unpack_32
|
|
748
|
+
end
|
|
446
749
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
## so that we are in register for the next reading
|
|
450
|
-
def read_extra_references(fh, global_ref_hash)
|
|
451
|
-
$SRF_OUT_HIT_FH_POS = fh.pos
|
|
452
|
-
st = fh.read(4)
|
|
453
|
-
#puts "HHH: " + st.unpack("H*").first
|
|
454
|
-
## if we see 0000 0000 we are done
|
|
455
|
-
if st.unpack(Unpack_four_null_bytes).first == FourNullBytes_as_string
|
|
456
|
-
fh.pos = $SRF_OUT_HIT_FH_POS
|
|
457
|
-
return nil
|
|
458
|
-
end
|
|
459
|
-
# read in context of 4 bytes read above:
|
|
460
|
-
|
|
461
|
-
## NOTE: in context of 4 bytes read above!
|
|
462
|
-
st = fh.read(36)
|
|
463
|
-
if st[34,2] == NewRecordStart
|
|
464
|
-
fh.pos = $SRF_OUT_HIT_FH_POS
|
|
465
|
-
return nil
|
|
466
|
-
end
|
|
750
|
+
## get the first part of the info
|
|
751
|
+
st = fh.read(( unpack_35 ? Read_35 : Read_32) ) ## read all the hit data
|
|
467
752
|
|
|
468
|
-
|
|
469
|
-
## BACK to beginning of this section
|
|
470
|
-
fh.pos = $SRF_OUT_HIT_FH_POS
|
|
471
|
-
if fh.read(9) == Sequest_record_start
|
|
472
|
-
fh.pos = $SRF_OUT_HIT_FH_POS
|
|
473
|
-
return
|
|
474
|
-
end
|
|
753
|
+
self[0,10] = st.unpack(unpack)
|
|
475
754
|
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
self[9].push( new_protein(fh.read(80).unpack(Unpack_Zstar).first, self, global_ref_hash ) )
|
|
755
|
+
# we are slicing the reference to 38 chars to be the same length as
|
|
756
|
+
# duplicate references
|
|
757
|
+
self[10] = [new_protein(self[10][0,38], self, global_ref_hash)]
|
|
480
758
|
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
# abort
|
|
487
|
-
#end
|
|
488
|
-
|
|
489
|
-
read_extra_references(fh,global_ref_hash)
|
|
759
|
+
self[13] = SpecID::Pep.sequence_to_aaseq(self[9])
|
|
760
|
+
|
|
761
|
+
fh.read(6) if unpack_35
|
|
762
|
+
|
|
763
|
+
self
|
|
490
764
|
end
|
|
491
765
|
|
|
766
|
+
|
|
767
|
+
|
|
492
768
|
def new_protein(reference, peptide, global_ref_hash)
|
|
493
769
|
if global_ref_hash.key? reference
|
|
494
770
|
global_ref_hash[reference].peps << peptide
|
|
@@ -498,29 +774,20 @@ class SRF::OUT::Pep
|
|
|
498
774
|
global_ref_hash[reference]
|
|
499
775
|
end
|
|
500
776
|
|
|
501
|
-
|
|
502
|
-
## get the first part of the info
|
|
503
|
-
st = fh.read(320) ## read all the hit data
|
|
504
|
-
self[0,10] = st.unpack(Unpack)
|
|
505
|
-
# we are slicing the reference to 38 chars to be the same length as
|
|
506
|
-
# duplicate references
|
|
507
|
-
self[9] = [new_protein(self[9][0,38], self, global_ref_hash)]
|
|
508
|
-
self[12] = SpecID::Pep.sequence_to_aaseq(self[8])
|
|
509
|
-
read_extra_references(fh, global_ref_hash)
|
|
510
|
-
|
|
511
|
-
self
|
|
512
|
-
end
|
|
513
|
-
|
|
514
|
-
end
|
|
777
|
+
end
|
|
515
778
|
|
|
516
779
|
SRF::OUT::Prot = ArrayClass.new( %w(reference peps) )
|
|
517
780
|
|
|
518
781
|
class SRF::OUT::Prot
|
|
519
782
|
include SpecID::Prot
|
|
783
|
+
# we shouldn't have to do this because this is included in SpecID::Prot, but
|
|
784
|
+
# under some circumstances it won't work without explicitly calling it.
|
|
785
|
+
include ProteinReferenceable
|
|
520
786
|
|
|
521
787
|
tmp = $VERBOSE ; $VERBOSE = nil
|
|
522
788
|
def initialize(reference=nil, peps=[])
|
|
523
|
-
super(@@arr_size)
|
|
789
|
+
#super(@@arr_size)
|
|
790
|
+
super(size)
|
|
524
791
|
#@reference = reference
|
|
525
792
|
#@peps = peps
|
|
526
793
|
self[0,2] = reference, peps
|
|
@@ -529,6 +796,7 @@ class SRF::OUT::Prot
|
|
|
529
796
|
|
|
530
797
|
# "<SRF::OUT::Prot reference=\"#{@reference}\">"
|
|
531
798
|
|
|
799
|
+
undef_method :inspect
|
|
532
800
|
def inspect
|
|
533
801
|
"<SRF::OUT::Prot @reference=#{reference}, @peps(#)=#{peps.size}>"
|
|
534
802
|
end
|