mspire 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/spec_id/srf.rb
DELETED
|
@@ -1,973 +0,0 @@
|
|
|
1
|
-
require 'fileutils'
|
|
2
|
-
|
|
3
|
-
require 'spec_id'
|
|
4
|
-
require 'spec_id/sequest'
|
|
5
|
-
require 'fasta'
|
|
6
|
-
require 'mspire'
|
|
7
|
-
require 'set'
|
|
8
|
-
|
|
9
|
-
require 'core_extensions'
|
|
10
|
-
|
|
11
|
-
# Mixin with helpers for pulling fixed-width fields out of a binary stream.
module BinaryReader
  # First element of a one-character NUL string; compared against the first
  # element of freshly read data to detect an "empty" field.
  Null_char = "\0"[0]  ## TODO: change for ruby 1.9 or 2.0

  # Reads +bytes+ bytes from +fh+ (which must already be positioned at the
  # start of the field) and returns them with trailing padding stripped.
  #
  # fh::    an IO-like object responding to +read+
  # bytes:: width of the field in bytes
  #
  # Returns '' when the field starts with a NUL character (an empty
  # declaration) or when the stream is already exhausted (+read+ gave nil);
  # otherwise returns the data with trailing whitespace/NULs removed.
  def get_null_padded_string(fh, bytes)
    st = fh.read(bytes)
    # read returns nil at end of stream; treat that like an empty field
    # instead of failing on nil[0]
    return '' if st.nil? || st[0] == Null_char
    st.rstrip!
    st
  end
end
|
|
25
|
-
|
|
26
|
-
# class to extract information from <file>_dta.log files
|
|
27
|
-
|
|
28
|
-
# A collection of SRF objects read together so that the peptide hits of all
# files share one protein list.
class SRFGroup
  include SpecID

  ## the srf objects themselves
  attr_accessor :srfs, :filenames
  ## also inherits :peps and :prots accessor

  # filenames::             an array of .srf filenames, or a single .srg
  #                         filename (a text file listing .srf paths; see
  #                         SRFGroup.srg_to_paths)
  # filter_hits_by_params:: when true (the default) the hits are filtered by
  #                         the sequest params peptide mass tolerance.
  #                         [The raw SRF data is unfiltered!]
  #
  # When a .srg file lists a path that does not exist, a message is printed
  # and the process aborts.
  #
  # NOTE(review): when a .srg filename is given, @filenames keeps the .srg
  # string (the expanded list is only used locally) and @filename is set in
  # addition -- confirm this is what to_srg and other callers expect.
  def initialize(filenames=nil, filter_hits_by_params=true)
    @filenames = filenames
    @peps = []
    @prots = []
    @srfs = []

    # This is essentially duplicated in SQTGroup (should refactor eventually)
    global_ref_hash = {}
    return unless filenames

    if filenames.is_a?(String) && filenames =~ /\.srg$/
      srg_filename = filenames.dup
      @filename = srg_filename
      filenames = SRFGroup.srg_to_paths(filenames)
      filenames.each do |file|
        next if File.exist?(file)
        puts "File: #{file} in #{srg_filename} does not exist!"
        puts "Please modify #{srg_filename} to point to existing files."
        abort
      end
    end

    # every SRF appends its hits to @peps and registers proteins in the
    # shared hash, so identical references resolve to the same object
    filenames.each do |file|
      @srfs << SRF.new(file, @peps, global_ref_hash)
    end
    @prots = global_ref_hash.values
    filter_by_peptide_mass_tolerance if filter_hits_by_params
  end

  # reads a srg file and delivers the path names (one per line; lines
  # without any word character are ignored)
  def self.srg_to_paths(file)
    IO.readlines(file).grep(/\w/).map {|v| v.chomp }
  end

  # if srfs were read in separately, then the proteins will need to be merged
  # by their reference
  def merge_different_sets(srfs)
    raise NotImplementedError, "need to implement?"
  end

  # 1. sets @prots and returns it: a new list of proteins based on which
  #    peptides passed.
  # 2. updates the out_file's list of hits based on passing peptides (but not
  #    the original hit id; rank is implicit in array ordering)
  # 3. updates each protein to only include peptides passing thresholds.
  #    [Note, this process is how .out files are generated!]
  # 4. recalculates deltacn values completely if number of hits changed (does
  #    not touch deltacn orig)
  # ASSUMES:
  # A. all srfs have identical params objects and each has a
  #    peptide_mass_tolerance filter attribute.
  # B. proteins are already unique (peptides referencing the same protein
  #    reference the same object already)  In practice, this means all srfs
  #    were read in together.
  #
  # With no srfs loaded there is nothing to filter: @prots becomes [] and is
  # returned.
  def filter_by_peptide_mass_tolerance
    if @srfs.empty?
      @prots = []
      return @prots
    end

    prots_in_set = Set.new
    params = @srfs.first.params
    pmt = params.peptide_mass_tolerance.to_f

    # peptide_mass_units: '0' and '1' compare deltamass ('1' divides the
    # tolerance by 1000), '2' compares ppm.  Any other value leaves both
    # flags nil, which takes the ppm branch below.
    amu_based, milli_amu =
      case params.peptide_mass_units
      when '0' then [true, false]
      when '1' then [true, true]
      when '2' then [false, false]
      end
    amu_limit = milli_amu ? (pmt / 1000) : pmt

    @srfs.each do |srf|
      srf.filtered_by_precursor_mass_tolerance = true
      srf.out_files.each do |out_file|
        hits = out_file.hits
        before = hits.size
        hits.reject! do |pep|
          do_not_keep =
            if amu_based
              pep.deltamass.abs > amu_limit
            else
              pep.ppm.abs > pmt
            end
          unless do_not_keep
            pep.prots.each do |prot|
              # first passing peptide for a protein resets its peptide list;
              # later ones are appended
              if prots_in_set.add?(prot)
                prot.peps = [pep]
              else
                prot.peps << pep
              end
            end
          end
          do_not_keep
        end
        if hits.size != before
          SRF::OUT::Pep.update_deltacns_from_xcorr(hits)
          out_file.num_hits = hits.size
        end
      end
    end
    @prots = prots_in_set.to_a
  end

  # writes one line per entry of @filenames to srg_filename and
  # returns the filename used
  # if the file exists, the name will be expanded to full path, otherwise just
  # what is given
  def to_srg(srg_filename='bioworks.srg')
    File.open(srg_filename, 'w') do |v|
      @filenames.each do |srf_file|
        if File.exist?(srf_file)
          v.puts File.expand_path(srf_file)
        else
          v.puts srf_file
        end
      end
    end
    srg_filename
  end
end
|
|
168
|
-
|
|
169
|
-
class SRF
|
|
170
|
-
|
|
171
|
-
# a string 3.5, 3.3 or 3.2
|
|
172
|
-
attr_accessor :version
|
|
173
|
-
|
|
174
|
-
attr_accessor :header
|
|
175
|
-
attr_accessor :dta_files
|
|
176
|
-
attr_accessor :out_files
|
|
177
|
-
attr_accessor :params
|
|
178
|
-
# a parallel array to dta_files and out_files where each entry is:
|
|
179
|
-
# [first_scan, last_scan, charge]
|
|
180
|
-
attr_accessor :index
|
|
181
|
-
attr_accessor :base_name
|
|
182
|
-
# this is the global peptides array
|
|
183
|
-
attr_accessor :peps
|
|
184
|
-
MASCOT_HYDROGEN_MASS = 1.007276
|
|
185
|
-
|
|
186
|
-
attr_accessor :filtered_by_precursor_mass_tolerance
|
|
187
|
-
|
|
188
|
-
# returns a Sequest::Params object
|
|
189
|
-
def self.get_sequest_params(filename)
  # Locates the last '[SEQUEST]' marker in the second half of the file and
  # hands the handle, positioned on that marker, to
  # Sequest::Params#parse_handle; that call's result is returned.
  # Only the second half is read, on the expectation that the params
  # section sits there.
  # NOTE(review): if the marker is absent, rindex gives nil and the addition
  # raises -- confirm how callers deal with files lacking a params section.
  File.open(filename) do |io|
    midpoint = io.stat.size / 2
    io.seek(midpoint)
    tail = io.read
    marker_offset = tail.rindex('[SEQUEST]') + midpoint
    io.seek(marker_offset)
    Sequest::Params.new.parse_handle(io)
  end
end
|
|
201
|
-
|
|
202
|
-
def dta_start_byte
  # Byte offset associated with each known @version string
  # (nil for any other version).
  { '3.2' => 3260, '3.3' => 3644, '3.5' => 3644 }[@version]
end
|
|
209
|
-
|
|
210
|
-
# peps and global_ref_hash are created as the srf files is read. If the
|
|
211
|
-
# file is read as part of a group, then these should be passed in.
|
|
212
|
-
# NOTE: if you want the hits filtered by precursor tolerance (the way they
|
|
213
|
-
# might be displayed in .out files) you should probably use SRFGroup (which
|
|
214
|
-
# does this by default)
|
|
215
|
-
# SRF is meant to be a low level read of the file.
|
|
216
|
-
def initialize(filename=nil, peps=[], global_ref_hash={})
  # Starts with empty dta/out file lists; when a filename is supplied the
  # file is read right away through from_file, which receives peps and
  # global_ref_hash unchanged.
  @dta_files, @out_files = [], []
  from_file(filename, peps, global_ref_hash) if filename
end
|
|
223
|
-
|
|
224
|
-
def round(float, decimal_places)
  # Formats float with exactly decimal_places digits after the decimal
  # point; the result is a String, not a number.
  template = "%.#{decimal_places}f"
  format(template, float)
end
|
|
227
|
-
|
|
228
|
-
# this mimicks the output of merge.pl from mascot
|
|
229
|
-
# The only difference is that this does not include the "\r\n"
|
|
230
|
-
# that is found after the peak lists, instead, it uses "\n" throughout the
|
|
231
|
-
# file (thinking that this is preferable to mixing newline styles!)
|
|
232
|
-
# note that Mass
|
|
233
|
-
# if no filename is given, will use base_name + '.mgf'
|
|
234
|
-
def to_mgf_file(filename=nil)
  # Writes one BEGIN IONS / END IONS record per dta file to filename
  # (base_name + '.mgf' when no filename is given), using "\n" line endings
  # throughout.  Each record carries TITLE, CHARGE and PEPMASS lines followed
  # by the peak list, two values per line.
  filename ||= base_name + '.mgf'
  proton = SpecID::MONO[:h_plus]
  File.open(filename, 'wb') do |out|
    dta_files.zip(index) do |dta, scan_info|
      charge = dta.charge
      title = [base_name, *scan_info].push('dta').join('.')
      pepmass = (dta.mh+((charge-1)*proton))/charge
      out.puts 'BEGIN IONS'
      out.puts "TITLE=#{title}"
      out.puts "CHARGE=#{charge}+"
      out.puts "PEPMASS=#{pepmass}"
      # peaks is a packed string of little-endian single-precision floats
      dta.peaks.unpack('e*').each_slice(2) do |pair|
        out.puts(pair.join(' '))
      end
      out.puts '', 'END IONS', ''
    end
  end
end
|
|
258
|
-
|
|
259
|
-
# not given an out_folder, will make one with the basename
|
|
260
|
-
# compress may be: :zip, :tgz, or nil (no compression)
|
|
261
|
-
# :zip requires gem rubyzip to be installed and is *very* bloated
|
|
262
|
-
# as it writes out all the files first!
|
|
263
|
-
# :tgz requires gem archive-tar-minitar to be installed
|
|
264
|
-
def to_dta_files(out_folder=nil, compress=nil)
  # Writes every dta file as '<base_name>.<index entry joined by dots>.dta'.
  # out_folder:: target folder / archive stem; base_name when not given
  # compress::   :tgz  -> '<outdir>.tgz' (needs gem archive-tar-minitar)
  #              :zip  -> '<outdir>.zip' (needs gem rubyzip)
  #              anything else -> plain files inside outdir
  # The optional gems are required lazily, only for the branch that needs
  # them; a missing gem aborts with a message.
  outdir = out_folder || base_name

  if compress == :tgz
    begin
      require 'archive/tar/minitar'
    rescue LoadError
      abort "need gem 'archive-tar-minitar' installed' for tgz compression!\n#{$!}"
    end
    require 'archive/targz'  # my own simplified interface!
    require 'zlib'
    entry_names = index.map do |scan_info|
      "#{outdir}/#{[base_name, *scan_info].join('.')}.dta"
    end

    gz_stream = Zlib::GzipWriter.new(File.open(outdir + '.tgz', 'wb'))

    Archive::Tar::Minitar::Output.open(gz_stream) do |tar_out|
      dta_files.zip(entry_names) do |dta, entry_name|
        Archive::Tar::Minitar.pack_as_file(entry_name, dta.to_dta_file_data, tar_out)
      end
    end
  elsif compress == :zip
    begin
      require 'zip/zipfilesystem'
    rescue LoadError
      abort "need gem 'rubyzip' installed' for zip compression!\n#{$!}"
    end
    Zip::ZipFile.open(outdir + ".zip", Zip::ZipFile::CREATE) do |zip|
      dta_files.zip(index) do |dta, scan_info|
        entry_name = outdir + '/' + [base_name, *scan_info].join('.') + '.dta'
        zip.get_output_stream(entry_name) do |stream|
          dta.write_dta_file(stream)
        end
      end
    end
  else
    # no compression
    FileUtils.mkpath(outdir)
    Dir.chdir(outdir) do
      dta_files.zip(index) do |dta, scan_info|
        dta_name = [base_name, *scan_info].join('.') + '.dta'
        File.open(dta_name, 'wb') do |stream|
          dta.write_dta_file(stream)
        end
      end
    end
  end
end
|
|
318
|
-
|
|
319
|
-
# the out_filename will be the base_name + .sqt unless 'out_filename' is
|
|
320
|
-
# defined
|
|
321
|
-
# :round => round floating point numbers
|
|
322
|
-
# etc...
|
|
323
|
-
def to_sqt(out_filename=nil, opts={})
  # Writes this object's header, spectra and hits as an SQT text file.
  #
  # out_filename:: output path; base_name + '.sqt' when not given
  # opts::
  #   :db_info        => add DBSeqLength / DBLocusCount / DBMD5Sum header
  #                      lines when the database file exists (default false)
  #   :new_db_path    => directory used to locate the database instead of
  #                      the path stored in the header (default nil)
  #   :update_db_path => with :new_db_path, also write the relocated
  #                      (expanded) path into the Database header line
  #                      (default false)
  #   :round          => round floating point numbers to fixed decimal
  #                      places (default false)
  #
  # Line types written: 'H' header lines, then one 'S' line per out file,
  # followed by an 'M' line per hit and an 'L' line per protein of each hit.

  # decimal places used when opts[:round] is set
  tic_dp = 2
  mh_dp = 7
  xcorr_dp = 5
  sp_dp = 2
  dcn_dp = 5

  defaults = {:db_info=>false, :new_db_path=>nil, :update_db_path=>false, :round=>false}
  opt = defaults.merge(opts)

  outfile = out_filename || (base_name + '.sqt')

  # these header keys are printed first, in this order, just for readability
  # and consistency
  invariant_ordering = %w(SQTGenerator SQTGeneratorVersion Database FragmentMasses PrecursorMasses StartTime)
  fmt = (params.fragment_mass_type == 'average') ? 'AVG' : 'MONO'
  pmt = (params.precursor_mass_type == 'average') ? 'AVG' : 'MONO'

  mass_table = params.mass_table
  static_mods = params.static_mods.map do |k,v|
    key = k.split(/_/)[1]
    if key.size == 1
      # single-letter key: report mass table entry plus the modification
      key + '=' + (mass_table[key.to_sym] + v.to_f).to_s
    else
      key + '=' + v
    end
  end

  # each parenthesized group in header.modifications becomes one entry,
  # with its first space turned into '='
  dynamic_mods = []
  header.modifications.scan(/\((.*?)\)/) do |match|
    dynamic_mods << match.first.sub(/ /,'=')
  end
  plural = {
    'StaticMod' => static_mods,
    'DynamicMod' => dynamic_mods,  # example as diff mod
    'Comment' => ['Created from Bioworks .srf file']
  }

  db_filename = header.db_filename
  db_filename_in_sqt = db_filename
  if opt[:new_db_path]
    # backslashes are converted first so File.basename can split the path
    db_filename = File.join(opt[:new_db_path], File.basename(db_filename.gsub('\\', '/')))
    if opt[:update_db_path]
      db_filename_in_sqt = File.expand_path(db_filename)
      warn "writing Database #{db_filename} to sqt, but it does not exist on this file system" unless File.exist?(db_filename)
    end
  end

  # 'then' rather than the colon form: a colon after 'when' is a syntax
  # error on Ruby 1.9 and later
  apmu =
    case params.peptide_mass_units
    when '0' then 'amu'
    when '1' then 'mmu'
    when '2' then 'ppm'
    end

  hh = {
    'SQTGenerator' => 'mspire',
    'SQTGeneratorVersion' => Mspire::Version,
    'Database' => db_filename_in_sqt,
    'FragmentMasses' => fmt,
    'PrecursorMasses' => pmt,
    'StartTime' => '',  # Bioworks 3.2 also leaves this blank...
    'Alg-PreMassTol' => params.peptide_mass_tolerance,
    'Alg-FragMassTol' => params.fragment_ion_tolerance,
    'Alg-PreMassUnits' => apmu,  ## mine
    'Alg-IonSeries' => header.ion_series.split(':').last.lstrip,
    'Alg-Enzyme' => header.enzyme.split(':').last,
    'Alg-MSModel' => header.model,
  }

  if opt[:db_info]
    if File.exist?(db_filename)
      reply = get_db_info_for_sqt(db_filename)
      %w(DBSeqLength DBLocusCount DBMD5Sum).zip(reply) do |label,val|
        hh[label] = val
      end
    else
      warn "file #{db_filename} does not exist, no extra db info in header!"
    end
  end

  has_hits = (self.out_files.size > 0)
  if has_hits
    # somewhat redundant with above, but we can get this without a db present!
    hh['DBLocusCount'] = self.out_files.first.db_locus_count
  end

  File.open(outfile, 'w') do |out|
    # print the header: fixed-order keys first (removed from hh as they are
    # written), then whatever remains, then the multi-valued entries
    invariant_ordering.each do |iv|
      out.puts ['H', iv, hh.delete(iv)].join("\t")
    end
    hh.each do |k,v|
      out.puts ['H', k, v].join("\t")
    end
    plural.each do |k,vals|
      vals.each do |val|
        out.puts ['H', k, val].join("\t")
      end
    end

    ##### SPECTRA
    # don't have the time to process (using 0.0 like bioworks 3.2)
    time_to_process = '0.0'
    #########################################
    # NEED TO FIGURE OUT: (in spectra guy)
    #    * Lowest Sp value for top 500 spectra
    #    * Number of sequences matching this precursor ion
    #########################################

    manual_validation_status = 'U'
    self.out_files.zip(dta_files) do |out_file, dta_file|
      dta_file_mh = dta_file.mh
      out_file_total_inten = out_file.total_inten
      out_file_lowest_sp = out_file.lowest_sp
      if opt[:round]
        dta_file_mh = round(dta_file_mh, mh_dp)
        out_file_total_inten = round(out_file_total_inten, tic_dp)
        out_file_lowest_sp = round(out_file_lowest_sp, sp_dp)
      end

      out.puts ['S', out_file.first_scan, out_file.last_scan, out_file.charge, time_to_process, out_file.computer, dta_file_mh, out_file_total_inten, out_file_lowest_sp, out_file.num_matched_peptides].join("\t")
      out_file.hits.each_with_index do |hit,hit_i|
        hit_mh = hit.mh
        hit_deltacn_orig_updated = hit.deltacn_orig_updated
        hit_xcorr = hit.xcorr
        hit_sp = hit.sp
        if opt[:round]
          hit_mh = round(hit_mh, mh_dp)
          hit_deltacn_orig_updated = round(hit_deltacn_orig_updated, dcn_dp)
          hit_xcorr = round(hit_xcorr, xcorr_dp)
          hit_sp = round(hit_sp, sp_dp)
        end
        # note that the rank is determined by the order..
        out.puts ['M', hit_i+1, hit.rsp, hit_mh, hit_deltacn_orig_updated, hit_xcorr, hit_sp, hit.ions_matched, hit.ions_total, hit.sequence, manual_validation_status].join("\t")
        hit.prots.each do |prot|
          out.puts ['L', prot.first_entry].join("\t")
        end
      end
    end
  end  # close the filehandle

end
|
|
475
|
-
|
|
476
|
-
# assumes the file exists and is readable
|
|
477
|
-
# returns [DBSeqLength, DBLocusCount, DBMD5Sum] or nil if no file
|
|
478
|
-
def get_db_info_for_sqt(dbfile)
  # Loads dbfile as a Fasta object and returns
  # [aa_seq_length, size, md5_sum] taken from it, in that order.
  db = Fasta.new(dbfile)
  seq_length = db.aa_seq_length
  locus_count = db.size
  md5 = db.md5_sum
  [seq_length, locus_count, md5]
end
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
# returns self
|
|
485
|
-
def from_file(filename, peps, global_ref_hash)
  # Reads filename into this object: header, dta files, out files and, when
  # the file continues past the out files, the params and the scan index.
  # Every hit is appended to peps and annotated with base name, index entry,
  # deltamass, ppm and a link back to this object.  Returns self.
  # Raises RuntimeError when the file's params report
  # print_duplicate_references as '0'.
  dup_refs = SRF.get_sequest_params(filename).print_duplicate_references
  if dup_refs == '0'
    message = <<END

***************************************************************************
Sorry, but the SRF reader cannot read this file!
.srf files must currently be created with print_duplicate_references > 0
(This is how the srf object can link peptides with proteins!)
To capture all duplicate references, set the sequest parameter
'print_duplicate_references' to 100 or greater.
***************************************************************************
END
    raise RuntimeError, message
  end

  File.open(filename, "rb") do |io|
    @header = SRF::Header.new.from_handle(io)
    @version = @header.version

    # true only for '3.5'; false for '3.2'/'3.3'; nil for anything else
    unpack_35 = { '3.2' => false, '3.3' => false, '3.5' => true }[@version]
    num_dta = @header.num_dta_files
    @dta_files, measured_mhs = read_dta_files(io, num_dta, unpack_35)

    @out_files = read_out_files(io, num_dta, global_ref_hash, measured_mhs, unpack_35)
    if io.eof?
      warn "FILE: '#{filename}' appears to be an abortive run (no params in srf file)\nstill continuing..."
      @params = nil
      @index = []
    else
      @params = Sequest::Params.new.parse_handle(io)
      # This is very sensitive to the grab_params method in sequest params
      io.read(12)  ## gap between last params entry and index
      @index = read_scan_index(io, num_dta)
    end
  end

  ### UPDATE SOME THINGS ON SINGLE PASS:
  # name of the .RAW file without directory or extension
  @base_name = @header.raw_filename.scan(/[\\\/]([^\\\/]+)\.RAW$/).first.first
  # give each hit a base_name, first_scan, last_scan
  @index.each_with_index do |scan_info, idx|
    measured_mh = @dta_files[idx][0]
    out_file = @out_files[idx]
    out_file[0,3] = *scan_info
    hits = out_file[6]
    peps.push(*hits)
    hits.each do |hit|
      hit[14,4] = @base_name, *scan_info
      hit[11] = hit[0] - measured_mh               # real - measured (deltamass)
      hit[12] = 1.0e6 * hit[11].abs / measured_mh  ## ppm
      hit[18] = self                               ## link with the srf object
    end
  end
  self
end
|
|
546
|
-
|
|
547
|
-
# returns an index where each entry is [first_scan, last_scan, charge]
|
|
548
|
-
# Reads the scan index from the handle.
#
# fh  :: IO positioned at the first index record
# num :: number of index records to read
#
# Every record occupies 24 bytes, of which only the first three 32-bit
# integers are decoded (the remaining 12 bytes are read and discarded).
#
# Returns an Array of num entries, each [first_scan, last_scan, charge].
def read_scan_index(fh, num)
  record_length = 24
  # 'V' is an explicitly little-endian 32-bit unsigned integer, in line with
  # the other little-endian directives ('e', 'v', 'V') this reader uses.
  # The native-endian 'I' decodes the same bytes differently on a
  # big-endian host.
  record_format = 'V3'
  index = Array.new(num)
  num.times do |i|
    # a missing or short record unpacks to nils instead of raising
    record = fh.read(record_length) || ''
    index[i] = record.unpack(record_format)
  end
  index
end
|
|
560
|
-
|
|
561
|
-
# returns [dta_files, measured_mhs]: the dta files plus a parallel array holding the measured mh of each
|
|
562
|
-
# Reads the dta records from the handle.
#
# fh        :: IO on the srf file; moved to dta_start_byte when it is not
#              already there
# num_files :: size given to both returned arrays
# unpack_35 :: handed through to SRF::DTA#from_handle
#
# Returns [dta_files, measured_mhs]; measured_mhs is a parallel array
# holding element 0 of each dta file.
#
# NOTE(review): the number of records read comes from header.num_dta_files,
# not from num_files; the two agree when the caller passes
# header.num_dta_files.
def read_dta_files(fh, num_files, unpack_35)
  mhs = Array.new(num_files)
  dtas = Array.new(num_files)
  dta_offset = dta_start_byte
  fh.pos = dta_offset if fh.pos != dta_offset

  0.upto(header.num_dta_files - 1) do |slot|
    dtas[slot] = SRF::DTA.new.from_handle(fh, unpack_35)
    mhs[slot] = dtas[slot][0]
  end
  [dtas, mhs]
end
|
|
577
|
-
|
|
578
|
-
# filehandle (fh) must be at the start of the outfiles. 'read_dta_files'
|
|
579
|
-
# will put the fh there.
|
|
580
|
-
# Reads the out records from the handle, which must already sit at the
# first one.
#
# fh              :: IO on the srf file
# number_files    :: size given to the returned array
# global_ref_hash :: handed through to SRF::OUT#from_handle
# measured_mhs    :: accepted but not used here
# unpack_35       :: handed through to SRF::OUT#from_handle
#
# Returns the Array of SRF::OUT objects.
#
# NOTE(review): the number of records read comes from header.num_dta_files,
# not from number_files.
def read_out_files(fh, number_files, global_ref_hash, measured_mhs, unpack_35)
  outs = Array.new(number_files)
  (0...header.num_dta_files).each do |slot|
    outs[slot] = SRF::OUT.new.from_handle(fh, global_ref_hash, unpack_35)
  end
  outs
end
|
|
587
|
-
|
|
588
|
-
end
|
|
589
|
-
|
|
590
|
-
# The header of an srf file: the format version, the dta generation
# parameters (an SRF::DTAGen) and nine null-padded string fields.
class SRF::Header
  include BinaryReader

  # Absolute byte offset of each string field.  Only :enzyme is used for
  # seeking; the fields after it are read back to back from that position.
  Start_byte = {
    :enzyme => 438,
    :ion_series => 694,
    :model => 950,
    :modifications => 982,
    :raw_filename => 1822,
    :db_filename => 2082,
    :dta_log_filename => 2602,
    :params_filename => 3122,
    :sequest_log_filename => 3382,
  }

  # Width in bytes of each string field.
  Byte_length = {
    :enzyme => 256,
    :ion_series => 256,
    :model => 32,
    :modifications => 840,
    :raw_filename => 260,
    :db_filename => 520,
    :dta_log_filename => 520,
    :params_filename => 260,
    :sequest_log_filename => 262, ## is this really 262?? or should be 260??
  }

  # Widths that replace the ones in Byte_length for version '3.2' files.
  Byte_length_v32 = {
    :modifications => 456,
  }

  # String of the form '3.<n>', <n> being the integer held in the first
  # four bytes of the file
  attr_accessor :version
  # a SRF::DTAGen object
  attr_accessor :dta_gen
  attr_accessor :enzyme
  attr_accessor :ion_series
  attr_accessor :model
  attr_accessor :modifications
  attr_accessor :raw_filename
  attr_accessor :db_filename
  attr_accessor :dta_log_filename
  attr_accessor :params_filename
  attr_accessor :sequest_log_filename

  # the number of dta files recorded in the dta generation parameters
  def num_dta_files
    @dta_gen.num_dta_files
  end

  # Fills in this header from the handle and returns self.
  #
  # The handle is moved to byte 0 before anything is read, so the version is
  # always taken from the start of the file regardless of where the caller
  # left the handle (every later seek in this method is absolute as well).
  def from_handle(fh)
    fh.pos = 0 if fh.pos != 0
    # 'V' = little-endian 32-bit unsigned; the native-endian 'I' would
    # decode the same bytes differently on a big-endian host.
    @version = '3.' + fh.read(4).unpack('V').first.to_s
    @dta_gen = SRF::DTAGen.new.from_handle(fh)

    ## get the rest of the info
    widths = Byte_length
    widths = Byte_length.merge(Byte_length_v32) if @version == '3.2'

    fh.pos = Start_byte[:enzyme]
    # get_null_padded_string is not defined in this class (presumably it
    # comes from BinaryReader -- confirm).
    [:enzyme, :ion_series, :model, :modifications, :raw_filename, :db_filename, :dta_log_filename, :params_filename, :sequest_log_filename].each do |param|
      send("#{param}=", get_null_padded_string(fh, widths[param]))
    end
    self
  end

end
|
|
654
|
-
|
|
655
|
-
# the DTA Generation Params
|
|
656
|
-
# The dta generation parameters, decoded from the first 148 bytes of an srf
# file.
class SRF::DTAGen

  ## not sure if this is correct
  # Float
  attr_accessor :start_time
  # Float
  attr_accessor :start_mass
  # Float
  attr_accessor :end_mass
  # Integer
  attr_accessor :num_dta_files
  # Integer
  attr_accessor :group_scan
  ## not sure if this is correct
  # Integer
  attr_accessor :min_group_count
  # Integer
  attr_accessor :min_ion_threshold
  #attr_accessor :intensity_threshold # can't find yet
  #attr_accessor :precursor_tolerance # can't find yet
  # Integer
  attr_accessor :start_scan
  # Integer
  attr_accessor :end_scan

  # Moves the handle to byte 0, reads 148 bytes and decodes the nine
  # parameters from them.  Returns self.
  #
  # Layout (byte offset => attribute):
  #   36 start_time, 52 start_mass, 60 end_mass   (little-endian floats)
  #   112 num_dta_files                           (32-bit integer)
  #   128 group_scan, 132 min_group_count, 136 min_ion_threshold,
  #   140 start_scan, 144 end_scan                (32-bit integers)
  #
  # The integers are unpacked with 'V' (little-endian) so that they agree
  # with the little-endian 'e' floats in the same record on any host; the
  # native-endian 'I' would not on a big-endian machine.
  def from_handle(fh)
    fh.pos = 0 if fh.pos != 0
    st = fh.read(148)
    (@start_time, @start_mass, @end_mass, @num_dta_files, @group_scan, @min_group_count, @min_ion_threshold, @start_scan, @end_scan) = st.unpack('x36ex12ex4ex48Vx12VVVVV')
    self
  end
end
|
|
689
|
-
|
|
690
|
-
# total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
|
|
691
|
-
# unknown is, well unknown...
|
|
692
|
-
SRF::DTA = Arrayclass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))

# One dta record of an srf file.
# 0=mh 1=dta_tic 2=num_peaks 3=charge 4=ms_level 5=unknown
# 6=total_num_possible_charge_states 7=peaks
class SRF::DTA
  # original
  # Unpack = "EeIvvvv"
  #
  # The 32-bit integer is unpacked with 'V' (little-endian) in both formats;
  # the native-endian 'I' would misread it on a big-endian host.
  Unpack_32 = "EeVvvvv"
  Unpack_35 = "Ex8eVx2vvvv"

  # note on peaks (self[7])
  # this is a byte array of floats, you can get the peaks out with
  # unpack("e*")

  undef_method :inspect
  # Short description that reports the peaks as a byte count instead of
  # dumping the raw string.
  def inspect
    peaks_st = self[7] ? "[#{self[7].size} bytes]" : 'nil'
    "<SRF::DTA @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
  end

  # Reads one dta record from the handle and returns self.
  #
  # fh        :: IO positioned at the start of the record
  # unpack_35 :: true selects the Unpack_35 layout (34 byte header, 22 byte
  #              spacer), otherwise Unpack_32 (24 byte header, 24 byte
  #              spacer)
  def from_handle(fh, unpack_35)
    if unpack_35
      @unpack = Unpack_35
      @read_header = 34
      @read_spacer = 22
    else
      @unpack = Unpack_32
      @read_header = 24
      @read_spacer = 24
    end

    # get the bulk of the data in single unpack
    self[0,7] = fh.read(@read_header).unpack(@unpack)

    # Scan numbers are given at the end in an index!
    # (the spacer is read only to advance the handle)
    fh.read(@read_spacer)

    # 8 bytes are read for every peak
    self[7] = fh.read(num_peaks * 8)
    self
  end

  # Returns the text of a dta file: a "mh charge" line followed by one line
  # per pair of floats in peaks, each line ending in "\r\n".
  # round_to is not defined in this class (presumably a Float extension
  # loaded elsewhere -- confirm).
  def to_dta_file_data
    string = "#{mh.round_to(6)} #{charge}\r\n"
    peak_ar = peaks.unpack('e*')
    0.step(peak_ar.size - 1, 2) do |i|
      # %d is equivalent to floor, so we round by adding 0.5!
      string << "#{peak_ar[i].round_to(4)} #{(peak_ar[i+1] + 0.5).floor}\r\n"
    end
    string
  end

  # write a class dta file to the io object
  def write_dta_file(io)
    io.print to_dta_file_data
  end

end
|
|
752
|
-
|
|
753
|
-
SRF::OUT = Arrayclass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
# 0=first_scan, 1=last_scan, 2=charge, 3=num_hits, 4=computer, 5=date_time, 6=hits, 7=total_inten, 8=lowest_sp, 9=num_matched_peptides, 10=db_locus_count

# One out record of an srf file together with its peptide hits.
class SRF::OUT
  # each yields num_hits, computer, date_time
  Unpack_32 = '@36vx2Z*@60Z*'
  Unpack_35 = '@36vx4Z*@62Z*'

  undef_method :inspect
  # Short description that reports the hits as a count only.
  def inspect
    hits_s = self[6] ? ", @hits(#)=#{hits.size}" : ''
    "<SRF::OUT first_scan=#{first_scan}, last_scan=#{last_scan}, charge=#{charge}, num_hits=#{num_hits}, computer=#{computer}, date_time=#{date_time}#{hits_s}>"
  end

  # Reads one out record and all of its hits from the handle; returns self.
  #
  # fh              :: IO positioned at the start of the record
  # global_ref_hash :: handed to SRF::OUT::Pep when hits are read
  # unpack_35       :: true selects the Unpack_35 layout, otherwise Unpack_32
  def from_handle(fh, global_ref_hash, unpack_35)
    ## EMPTY out file is 96 bytes
    ## each hit is 320 bytes
    ## num_hits and charge:
    st = fh.read(96)

    self[3,3] = st.unpack(unpack_35 ? Unpack_35 : Unpack_32)
    # total_inten, lowest_sp, num_matched_peptides, db_locus_count.
    # The two integers are unpacked with 'V' (little-endian) to agree with
    # the little-endian 'e' floats beside them; the native-endian 'I' would
    # misread them on a big-endian host.
    self[7,4] = st.unpack('@8eex4Vx4V')
    hit_count = self[3]

    pep_hits = Array.new(hit_count)
    if hit_count > 0
      num_extra_references = 0
      hit_count.times do |i|
        pep_hits[i] = SRF::OUT::Pep.new.from_handle(fh, global_ref_hash, unpack_35)
        num_extra_references += pep_hits[i].num_other_loci
      end
      # the extra references follow the last hit of the record
      SRF::OUT::Pep.read_extra_references(fh, num_extra_references, pep_hits, global_ref_hash)
      ## The xcorrs are already ordered by best to worst hit
      ## ADJUST the deltacn's to be meaningful for the top hit:
      ## (the same as bioworks and prophet)
      SRF::OUT::Pep.set_deltacn_from_deltacn_orig(pep_hits)
    end
    self[6] = pep_hits
    self
  end

end
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
# deltacn_orig - the one that sequest originally reports (top hit gets 0.0)
|
|
805
|
-
# deltacn - modified to be that of the next best hit (by xcorr) and the last
|
|
806
|
-
# hit takes 1.1. This is what is called deltacn by bioworks and pepprophet
|
|
807
|
-
# (at least for the first few years). If filtering occurs, it will be
|
|
808
|
-
# updated.
|
|
809
|
-
# deltacn_orig_updated - the latest updated value of deltacn.
|
|
810
|
-
# Originally, this will be equal to deltacn_orig. After filtering, this will
|
|
811
|
-
# be recalculated. To know if this will be different from deltacn_orig, query
|
|
812
|
-
# match.srf.filtered_by_precursor_mass_tolerance. If this is changed, then
|
|
813
|
-
# deltacn should also be changed to reflect it.
|
|
814
|
-
# mh - the theoretical mass + h
|
|
815
|
-
# prots are created as SRF prot objects with a reference and linked to their
|
|
816
|
-
# peptides (from global hash by reference)
|
|
817
|
-
# ppm = 10^6 * ∆m_accuracy / mass_measured [ where ∆m_accuracy = mass_real – mass_measured ]
|
|
818
|
-
# This is calculated for the M+H mass!
|
|
819
|
-
# num_other_loci is the number of other loci that the peptide matches beyond
|
|
820
|
-
# the first one listed
|
|
821
|
-
# srf = the srf object this scan came from
|
|
822
|
-
|
|
823
|
-
SRF::OUT::Pep = Arrayclass.new(%w( mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn deltacn_orig_updated) )

# 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn 20=deltacn_orig_updated

# One peptide hit of an out record.
class SRF::OUT::Pep
  include SpecID::Pep

  # creates the deltacn that is meaningful for the top hit (the deltacn_orig
  # of the second best hit and so on); the last hit takes 1.1.
  # assumes sorted.  An empty array is left alone.
  def self.set_deltacn_from_deltacn_orig(ar)
    return if ar.empty?
    (1...ar.size).each { |i| ar[i-1].deltacn = ar[i].deltacn_orig }
    ar[-1].deltacn = 1.1
  end

  # (assumes sorted)
  # recalculates deltacn from xcorrs and sets deltacn_orig_updated and deltacn
  def self.update_deltacns_from_xcorr(ar)
    if ar.size > 0
      top_score = ar.first[3]
      # score of every hit after the first, relative to the top xcorr
      other_scores = (1...ar.size).map { |i| 1.0 - (ar[i][3] / top_score) }
      ar.first[20] = 0.0
      other_scores.each_with_index do |score, i|
        ar[i][19] = score     # deltacn
        ar[i+1][20] = score   # deltacn_orig_updated
      end
      ar.last[19] = 1.1
    end
  end

  # Reads num_extra_references extra protein references from the handle and
  # appends each one to the prots of the hit it names.
  #
  # Each entry is an 8 byte prefix, whose second 32-bit integer is the
  # 1-based position of the hit within pep_hits, followed by an 80 byte
  # reference field.  Only the first 38 characters of the reference are
  # kept, as is done for the first reference of a hit.
  def self.read_extra_references(fh, num_extra_references, pep_hits, global_ref_hash)
    num_extra_references.times do
      # 'V' = little-endian 32-bit unsigned (the native-endian 'I' would
      # misread the index on a big-endian host)
      pep = pep_hits[fh.read(8).unpack('x4V').first - 1]

      ref = fh.read(80).unpack('A*').first
      pep[10] << pep.new_protein(ref[0,38], pep, global_ref_hash)
    end
    # fh.read(6) if unpack_35
  end

  # x2=???
  #Unpack_35 = '@64Ex8ex12eeIx22vx2vvx8Z*@246Z*'
  ### NOTE:
  # I need to verify that this is correct (I mean the integer after x18)
  #
  # translation: @64=(64 bytes in to the record), E=mH, x8=8 unknown bytes,
  # e=deltacn, x12=12 unknown bytes, e=sp, e=xcorr, V=ID#, x18=18 unknown
  # bytes, V=num_other_loci, v=rsp, x2=2 unknown bytes, v=ions_matched,
  # v=ions_total, x8=8 unknown bytes, Z*=sequence, @246Z*=at byte 246 grab
  # the string (which is proteins).
  # The integers use 'V' (little-endian 32-bit) rather than the
  # native-endian 'I' so the record decodes the same on any host.
  Unpack_35 = '@64Ex8ex12eeVx18Vvx2vvx8Z*@246Z*'
  # same fields as Unpack_35, with 14 unknown bytes before num_other_loci,
  # no gap after rsp, and the proteins string at byte 240
  #Unpack_32 = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
  Unpack_32 = '@64Ex8ex12eeVx14Vvvvx8Z*@240Z*'
  Unpack_four_null_bytes = 'a*'
  Unpack_Zstar = 'Z*'
  # number of bytes read for one hit in each layout
  Read_35 = 426
  Read_32 = 320

  FourNullBytes_as_string = "\0\0\0\0"
  #NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
  NewRecordStart = 0x01.chr + 0x00.chr
  Sequest_record_start = "[SEQUEST]"

  undef_method :inspect
  # Description listing the scalar fields, the number of prots and, when the
  # hit is linked to an srf object, that object's base_name.
  def inspect
    fields = %w(aaseq sequence mh deltacn_orig sp xcorr id rsp ions_matched ions_total prots deltamass ppm base_name first_scan last_scan charge deltacn).map do |name|
      value = send(name.to_sym)
      if name == 'prots'
        "#{name}(#)=#{value.size}"
      else
        "#{name}=#{value.inspect}"
      end
    end
    fields.unshift("<#{self.class}")
    fields.push("srf(base_name)=#{srf.base_name.inspect}") if srf
    fields.push('>')
    fields.join(' ')
  end

  # Reads one hit from the handle and returns self.
  #
  # fh              :: IO positioned at the start of the hit
  # global_ref_hash :: reference => SRF::OUT::Prot, shared so that a protein
  #                    seen before is reused (see new_protein)
  # unpack_35       :: true selects Unpack_35/Read_35, otherwise
  #                    Unpack_32/Read_32
  def from_handle(fh, global_ref_hash, unpack_35)
    if unpack_35
      layout = Unpack_35
      record_length = Read_35
    else
      layout = Unpack_32
      record_length = Read_32
    end

    ## read all the hit data
    st = fh.read(record_length)

    # The unpack yields eleven values: fields 0..9 plus the protein
    # reference string, which sits in slot 10 until it is replaced below.
    # The slice is eleven wide so that the array keeps its length (a ten
    # wide slice would insert the extra value and push every later slot
    # along by one).
    self[0,11] = st.unpack(layout)

    # set deltacn_orig_updated
    self[20] = self[1]

    # we are slicing the reference to 38 chars to be the same length as
    # duplicate references
    self[10] = [new_protein(self[10][0,38], self, global_ref_hash)]

    self[13] = SpecID::Pep.sequence_to_aaseq(self[9])

    # the 3.5 layout has 6 more bytes after each hit
    fh.read(6) if unpack_35

    self
  end

  # Returns the SRF::OUT::Prot registered under reference in
  # global_ref_hash, creating and registering it when absent.  In both
  # cases peptide is added to that protein's peps.
  def new_protein(reference, peptide, global_ref_hash)
    if global_ref_hash.key? reference
      global_ref_hash[reference].peps << peptide
    else
      global_ref_hash[reference] = SRF::OUT::Prot.new(reference, [peptide])
    end
    global_ref_hash[reference]
  end

end
|
|
943
|
-
|
|
944
|
-
SRF::OUT::Prot = Arrayclass.new( %w(reference peps) )

# A protein reference together with the peptide hits that point at it.
# 0=reference 1=peps
class SRF::OUT::Prot
  include SpecID::Prot
  # SpecID::Prot already includes this module, so the explicit include
  # should be unnecessary, but under some circumstances it won't work
  # without it.
  include ProteinReferenceable

  # warnings are switched off while initialize is defined and restored
  # straight afterwards
  saved_verbosity = $VERBOSE
  $VERBOSE = nil
  # reference :: stored in slot 0
  # peps      :: stored in slot 1
  def initialize(reference=nil, peps=[])
    super(self.class.size)
    self[0, 2] = [reference, peps]
  end
  $VERBOSE = saved_verbosity

  undef_method :inspect
  # Short description giving the reference and the number of peps.
  def inspect
    "<SRF::OUT::Prot @reference=#{reference}, @peps(#)=#{peps.size}>"
  end
end
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|