mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
require 'strscan'
|
|
2
|
+
|
|
3
|
+
# Opens the MS::Parser::MzXML namespace (MS::Parser must already be defined) so that classes can be declared inside it.
module MS::Parser::MzXML ; end
|
|
4
|
+
|
|
5
|
+
# Regular-expression based mzXML parser.
#
# An instance dispatches #parse to the method named at construction time.
# The class-level helpers read a whole file into memory and pull precursor
# values out of it with regexps.
class MS::Parser::MzXML::Regexp
  # Matches the opening of a <scan> element.
  @@first_scan_regexp = /<scan /o

  # Captures (1) the scan number and (2) everything following it up to the
  # first closing </scan> tag.
  # NOTE(review): because the match stops at the first </scan>, a scan that
  # contains nested <scan> children is presumably attributed to the outer
  # scan number -- confirm against real nested mzXML files.
  @@scan_re = /<scan.*?num="(\d+)"(.*?)<\/scan>/mo

  include MS::Parser::MzXML

  # method::  name of the instance method that #parse will invoke
  # version:: mzXML version string (stored only)
  def initialize(method=:msrun, version='1.0')
    @method = method
    @version = version
  end

  # Runs the configured parse method on +file+ and returns its result.
  def parse(file)
    send(@method, file)
  end

  # Opens +file+ and hands the handle to get_header.  The block form of
  # File.open guarantees the handle is closed even when get_header raises.
  #
  # Returns nil (as it always has; building an MS::MsRun is not implemented).
  # NOTE(review): get_header is not defined in this class; presumably it is
  # expected from an included module -- TODO confirm.
  def msrun(file)
    File.open(file) {|fh| get_header(fh) }
    nil
  end

  #def msrun(file, opts={})
  #end

  # Returns an array where array[scan_num] = [precursorMz, precursorIntensity]
  # for every scan whose text matches msLevel="2".  Values are strings.
  # (array will likely start at 1!)
  def self.precursor_mz_and_intensity_by_scan(file)
    prec_re = /msLevel="2".*?<precursorMz precursorIntensity="([\d\.]+)".*?>([\d\.]+)<\/precursorMz>/mo
    # captures come out as [intensity, mz]; reverse to [mz, intensity]
    self.by_scan_num(file, prec_re) {|match_obj| match_obj.captures.reverse}
  end

  # Reads +file+, finds every scan with @@scan_re and, for each scan whose
  # body matches +regex+, stores the block's result (given the MatchData) at
  # the index equal to the scan number.  Scans that do not match are left nil.
  # (array will likely start at 1!)
  def self.by_scan_num(file, regex)
    arr = []
    File.read(file).scan(@@scan_re) do |scan_num, scan_body|
      inner_match = regex.match(scan_body)
      arr[scan_num.to_i] = yield(inner_match) if inner_match
    end
    arr
  end

  # Returns array where array[scan_num] = precursorMz
  # Parent scans are not arrayed
  # Values are strings.  Array index likely starts at 1!
  # @TODO: replace the use of a yield block
  def self.precursor_mz_by_scan(file)
    prec_re = /msLevel="2".*?<precursorMz.*?>([\d\.]+)<\/precursorMz>/mo
    self.by_scan_num(file, prec_re) {|match_obj| match_obj.captures[0]}
  end

end
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Line-oriented, regexp based reader for the header of an mzXML run.
# Scan extraction is unfinished: #parse still aborts before reading scans.
class MS::Parser::MzXML::Regexp::MsRun
  @@scan_count_regexp = /scanCount="(\d+)"/o
  @@start_time_regexp = /startTime="PT([\d\.]+)S"/o
  @@end_time_regexp = /endTime="PT([\d\.]+)S"/o
  @@first_scan_regexp = /<scan /

  # version:: mzXML version string (stored only)
  def initialize(version='1.0')
    @version = version
  end

  # Reads the header values from +io+ and is meant to copy them onto
  # +msrun_object+.  Currently terminates the process with `abort` once the
  # header has been read, because scan extraction is not written yet.
  def parse(io, msrun_object)
    atts = {}
    # zip yields [name, value]; key the hash by attribute name
    [:scan_count, :start_time, :end_time].zip(get_header_info(io)) {|name, value| atts[name] = value }
    ###
    # HERE <------------------------------------
    abort "NEED TO FINISH WRITING SCANS EXTRACTOR!"
    get_scans(io)
    # HERE <------------------------------------

    # set the attributes
    # NOTE(review): assumes msrun_object exposes scan_count=, start_time= and
    # end_time= writers -- confirm against MS::MsRun once this is reachable.
    atts.each do |name, value|
      msrun_object.send("#{name}=", value)
    end
    # need to fill in the scan_counts array
  end

  # Placeholder: walks every remaining line of +io+ without using it.
  # assumes the attributes are each on a line
  def get_scans(io)
    io.each do |line|
    end
  end

  # returns [total_num_scans, start_time, end_time] (strings, or nil when not
  # found) and positions the handle so that the next 'gets' will return the
  # line holding the first <scan
  def get_header_info(io)
    scan_count = nil
    start_time = nil
    end_time = nil

    # Offset of the line about to be read.  Seeded from the current position
    # so that a <scan on the very first line can still be rewound to (this
    # used to attempt `io.pos = nil`).
    line_start = io.pos
    io.each do |line|
      scan_count = $1.dup if line =~ @@scan_count_regexp
      start_time = $1.dup if line =~ @@start_time_regexp
      end_time = $1.dup if line =~ @@end_time_regexp
      if line =~ @@first_scan_regexp
        io.pos = line_start
        break
      end
      line_start = io.pos
    end
    [scan_count, start_time, end_time]
  end

end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
require 'rexml/document'
|
|
2
|
+
require 'rexml/streamlistener'
|
|
3
|
+
|
|
4
|
+
# Declares an empty REXMLStreamListener namespace under MS::Parser::MzXML.
module MS::Parser::MzXML::REXMLStreamListener; end
|
|
5
|
+
# Empty placeholder class inside the REXMLStreamListener namespace.
# NOTE(review): the listener implemented later in this file is MS::Parser::MzXML::REXML::PrecMzByNum, a different constant -- confirm whether this placeholder is still needed.
class MS::Parser::MzXML::REXMLStreamListener::PrecMzByNum; end
|
|
6
|
+
|
|
7
|
+
# Helper for running a REXML stream listener over a file.
# Works both when included into a class and when used to extend a module.
module REXMLStreamListenerHelper
  # Stream-parses +file+ with a fresh listener and returns what the listener
  # reports.
  #
  # file::          path of the XML file to parse
  # const::         the listener class itself, or the name (String/Symbol) of
  #                 a constant holding it.  Names are looked up on the
  #                 receiver when it is a Module/Class, otherwise on the
  #                 receiver's class (instances have no const_get).
  # report_method:: method called on the listener after parsing; its return
  #                 value is returned
  def parse_and_report(file, const, report_method=:report)
    listener_class =
      if const.is_a?(Module)
        const
      else
        scope = is_a?(Module) ? self : self.class
        scope.const_get(const)
      end
    listener = listener_class.new
    File.open(file) do |fh|
      REXML::Document.parse_stream(fh, listener)
    end
    listener.send(report_method)
  end
end
|
|
16
|
+
|
|
17
|
+
# REXML based mzXML parser front end.
class MS::Parser::MzXML::REXML
  include MS::Parser::MzXML

  # version:: mzXML version string (stored only)
  # method::  name of the parse method to use (stored only)
  #
  # Previously assigned the undefined name `parse_type`, so construction
  # always raised NameError.
  # NOTE(review): the argument order here is (version, method), whereas
  # MS::Parser::MzXML::XMLParser#initialize takes (parse_type, version) --
  # confirm which order callers pass.
  def initialize(version='1.0', method=:msrun)
    @version = version
    @method = method
  end

  # returns an array indexed by scan_num that gives the precursor_mz
  # (delegates to parse_and_report with the PrecMzByNum listener class;
  # +opts+ is accepted but unused)
  # NOTE(review): parse_and_report is not defined in this class; presumably
  # it comes from an included module -- TODO confirm.
  def precursor_mz_by_scan(file, opts={})
    parse_and_report(file, PrecMzByNum)
  end

end
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# for REXML
|
|
36
|
+
# REXML stream listener that records, per scan number, the text found inside
# <precursorMz> elements.  The result (#prec_mz, aliased as #report) is an
# array indexed by scan number whose values are the raw text strings.
class MS::Parser::MzXML::REXML::PrecMzByNum
  include REXML::StreamListener

  attr_accessor :prec_mz
  alias_method :report, :prec_mz

  def initialize
    @get_data = false   # true only while inside a <precursorMz> element
    @scan_num = nil     # number of the most recently opened <scan>
    @prec_mz = []
  end

  # Remember the current scan number, or start collecting text on
  # <precursorMz>.
  def tag_start(name,attrs)
    case name
    when "scan"
      @scan_num = attrs["num"].to_i
    when "precursorMz"
      @get_data = true
    end
  end

  # Stop collecting text once </precursorMz> is reached.
  def tag_end(name)
    @get_data = false if name == "precursorMz"
  end

  # Store the text under the current scan number while collecting.
  def text(txt)
    @prec_mz[@scan_num] = txt if @get_data
  end

end
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
require 'xmlparser_wrapper'
|
|
2
|
+
|
|
3
|
+
# this is the wrapper class
|
|
4
|
+
# Wrapper class for the xmlparser based mzXML parsers.
class MS::Parser::MzXML::XMLParser
  include XMLStyleParser
  include MS::Parser::MzXML
  include XMLParserWrapper

  # parse_type:: name of the parse method to use (stored only)
  # version::    mzXML version string (stored only)
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # returns: [times_arr, [m/z,inten,m/z,inten...]]
  # where times are time strings (in seconds)
  # (+opts+ is accepted but unused)
  # NOTE(review): the 'TimesAndSpectra' listener is only present as
  # commented-out code in this file -- confirm it is defined elsewhere.
  def times_and_spectra(file, opts={})
    parse_and_report(file, 'TimesAndSpectra')
  end


  ## IN PROGRESS ...
  # opts is actually the msrun object that will be fleshed out in the parsing
  #
  # Still debugging scaffolding: prints +opts+ and the parser reply, then
  # aborts the process.  The block form of File.open makes sure the handle is
  # closed on the way out (the old explicit close sat after `abort` and could
  # never run).
  def msrun(file, opts={})
    p opts
    File.open(file) do |fh|
      reply = parse_and_report_io(fh, 'MsRunHeader')
      p reply
      abort
    end
  end

  # Not implemented yet; returns nil.
  def prec_mz_by_scan_num(file, opts={})
  end

  # could easily do this for all these guys
  #def method_missing(*args)
  #  method = args.shift
  #  parse_and_report(
  #end

end
|
|
42
|
+
|
|
43
|
+
# xmlparser handler that captures the scanCount, startTime and endTime
# attributes of the <msRun> element and stops at </dataProcessing>.
# NOTE(review): no #report method is defined here; confirm how callers are
# expected to read @atts.
class MS::Parser::MzXML::XMLParser::MsRunHeader < XMLParser
  # version:: mzXML version string (stored only)
  def initialize(version='1.0')
    @version = version
    @atts = []
  end

  # On <msRun>, store [scanCount, startTime, endTime] (attribute strings, nil
  # when absent) in @atts.
  def startElement(name,attrs)
    case name
    when 'msRun'
      # values_at takes one argument per key, so the list must be splatted;
      # passing the Array itself looked up a single (missing) key.
      @atts = attrs.values_at(*%w(scanCount startTime endTime))
    end
  end

  # The header ends with </dataProcessing>: call done and reset there.
  def endElement(name)
    if name == 'dataProcessing'
      done
      reset
    end
  end
end
|
|
63
|
+
|
|
64
|
+
# xmlparser handler that builds one spectrum per <peaks> element and attaches
# the MS::Scan built from the enclosing <scan> to it.  #report returns the
# array of spectra collected so far.
class MS::Parser::MzXML::XMLParser::Spectrum < XMLParser
  # <scan> attributes read, in the positional order handed to MS::Scan.new
  @@scan_atts = %w(num msLevel retentionTime startMz endMz)
  @@precursor_mz_atts = %w(precursorIntensity)


  # version:: mzXML version string (stored only)
  def initialize(version='1.0')
    @version = version
    @spectrum = []
    @current_scan = nil
  end

  # Returns the array of spectra collected so far.
  def report
    @spectrum
  end

  def startElement(name,attrs)
    if name == 'scan'
      # values_at takes one argument per key, so the list must be splatted;
      # passing the Array itself looked up a single (missing) key.
      vals = attrs.values_at(*@@scan_atts)
      vals[2] = vals[2][2...-1].to_f #remove PT and trailing S
      [0, 1].each do |i| vals[i] = vals[i].to_i end # num and ms_level
      [3, 4].each do |i| vals[i] = vals[i].to_f end # start_mz and end_mz
      @current_scan = MS::Scan.new(vals)
    elsif name == 'precursorMz'
      # 5, 6, 7 are the scans indices for prec_mz prec_inten and parent
      @current_scan[6] = attrs['precursorIntensity'].to_f
      @current_scan[5] = ''   # text is appended in #character
      @get_precursor_mz = true
    elsif name == 'peaks'
      @precision = attrs['precision'].to_i
      @get_peaks = true
      @current_peaks_string = ''
    end
  end

  def endElement(name)
    if name == 'peaks'
      @get_peaks = false
      # The context belongs on the new spectrum, not on the collecting Array
      # (which has no context= method).
      # NOTE(review): a bare `Spectrum` does not resolve to a spectrum class
      # from inside this handler; MS::Spectrum is assumed to be the intended
      # class, taking (peaks_string, precision) and offering context= --
      # confirm against ms/spectrum.
      spectrum = MS::Spectrum.new(@current_peaks_string, @precision)
      spectrum.context = @current_scan
      @spectrum << spectrum
    elsif name == 'precursorMz'
      @current_scan[5] = @current_scan[5].to_f
      @get_precursor_mz = false
    end
  end

  # Accumulates character data for whichever element is currently open.
  def character(data)
    if @get_peaks
      @current_peaks_string << data
    elsif @get_precursor_mz
      @current_scan[5] << data
    end
  end

end
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# xmlparser handler that records the precursor m/z of every scan.
# #prec_mz (aliased as #report) is an array indexed by scan number whose
# values are Floats.
class MS::Parser::MzXML::XMLParser::PrecMzByNum < XMLParser
  attr_accessor :prec_mz
  alias_method :report, :prec_mz

  def initialize
    @prec_mz = []
    # Parsing state lives on the instance.  It used to be assigned in the
    # class body, which set variables on the class object instead.
    @scan_num = nil          # number of the most recently opened <scan>
    @get_data = false        # true only while inside <precursorMz>
    @current_prec_mz = nil   # text accumulated for the open <precursorMz>
  end

  # Remember the current scan number, or start accumulating text on
  # <precursorMz>.
  def startElement(name,attrs)
    if name == "scan"
      @scan_num = attrs["num"].to_i
    elsif name == "precursorMz"
      @current_prec_mz = ""
      @get_data = true
    end
  end

  # On </precursorMz>, store the accumulated text as a Float under the
  # current scan number.
  def endElement(name)
    if name == "precursorMz"
      @get_data = false
      @prec_mz[@scan_num] = @current_prec_mz.to_f
    end
  end

  # Character data is appended because it may arrive in several pieces.
  def character(data)
    if @get_data
      @current_prec_mz << data
    end
  end

end
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
=begin
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# Returns parallel arrays (times, spectra) where each spectra is an array
|
|
162
|
+
# containing alternating mz and intensity (MS1 scans only)
|
|
163
|
+
# and times are strings with the time in seconds
|
|
164
|
+
class MS::Parser::MzXML::XMLParser::TimesAndSpectra < XMLParser
|
|
165
|
+
include MS::Parser::MzXML
|
|
166
|
+
@@get_data = false
|
|
167
|
+
@@get_peaks = false
|
|
168
|
+
@@precision = 32 # @TODO: set dynamic
|
|
169
|
+
|
|
170
|
+
attr_accessor :times, :spectra
|
|
171
|
+
def times_and_spectra
|
|
172
|
+
[@times, @spectra]
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
alias_method :report, :times_and_spectra
|
|
176
|
+
|
|
177
|
+
def initialize(ms_level=1)
|
|
178
|
+
@ms_level = "#{ms_level}"
|
|
179
|
+
@times = []
|
|
180
|
+
@spectra = []
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
def startElement(name,attrs)
|
|
184
|
+
if name == "scan" && attrs["msLevel"] == @ms_level
|
|
185
|
+
@times << attrs["retentionTime"][2...-1] # strip PT and S: "PTx.xxxxS"
|
|
186
|
+
@@get_peaks = true
|
|
187
|
+
elsif name == "peaks" && @@get_peaks
|
|
188
|
+
@@get_data = true
|
|
189
|
+
@data = ""
|
|
190
|
+
end
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def character(data)
|
|
194
|
+
if @@get_data
|
|
195
|
+
@data << data
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
def endElement(name)
|
|
200
|
+
if name == "peaks" && @@get_peaks
|
|
201
|
+
@spectra << base64_peaks_to_array(@data, @@precision)
|
|
202
|
+
@@get_data = false
|
|
203
|
+
@@get_peaks = false
|
|
204
|
+
end
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class MS::Parser::MzXML::XMLParser::TimeMzIntenIndexer < XMLParser
|
|
211
|
+
|
|
212
|
+
@@scan_num = nil
|
|
213
|
+
@@get_data = false
|
|
214
|
+
|
|
215
|
+
attr_accessor :scans_by_num
|
|
216
|
+
alias_method :report, :scans_by_num
|
|
217
|
+
|
|
218
|
+
def initialize
|
|
219
|
+
@current_scan = nil
|
|
220
|
+
@scans_by_num = []
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
def startElement(name,attrs)
|
|
224
|
+
if name == "scan"
|
|
225
|
+
num = attrs["num"].to_i
|
|
226
|
+
@current_scan = MS::Scan.new(num, attrs["msLevel"].to_i, attrs["retentionTime"].gsub(/^PT/,'').gsub(/S$/,'').to_f)
|
|
227
|
+
scans_by_num[num] = @current_scan
|
|
228
|
+
elsif name == "precursorMz"
|
|
229
|
+
@current_scan.prec_inten = attrs["precursorIntensity"].to_f
|
|
230
|
+
@@get_data = true
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
def endElement(name)
|
|
235
|
+
if name == "precursorMz"
|
|
236
|
+
@@get_data = false
|
|
237
|
+
end
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def character(data)
|
|
241
|
+
if @@get_data
|
|
242
|
+
@current_scan.prec_mz = data
|
|
243
|
+
end
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
=end
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
require 'ms/msrun'
|
|
2
|
+
|
|
3
|
+
# Make sure the top-level MS namespace exists.
module MS; end
|
|
4
|
+
|
|
5
|
+
# Namespace, factory and shared helpers for the mzXML parsers.
#
# The concrete parsers are classes inside this module, and some are named
# after core/stdlib constants (Regexp, REXML).  Code in this module therefore
# uses ::Regexp and ::REXML to reach the real ones.
module MS::Parser::MzXML
  Base_dir_for_parsers = 'ms/parser/mzxml'
  # inherits XMLStyleParser and version
  include MS::Parser
  include XMLStyleParser

  # returns a specific parser MS::Parser::MzXML::#{ParserType}
  # based on choose_parser from xml_style_parser
  #
  # parse_type:: parse method name, handed to the chosen parser class
  # version::    mzXML version string, handed to the chosen parser class
  def self.new(parse_type=:msrun, version='1.0')
    @version = version
    @method = parse_type
    XMLStyleParser.require_parse_files(Base_dir_for_parsers)
    parser_class = XMLStyleParser.choose_parser(self, parse_type)
    parser_class.new(parse_type, version)
  end

  # Returns an array of scans indexed by scan number
  # NOTE that the first scan (zero indexed) will likely be nil!
  # accepts an optional parse_type = 'xmlparser' | 'rexml'
  # (default_parser is used when none is given)
  # Raises ArgumentError for any other parse_type.
  def scans_by_num(mzXML_file, parse_type=nil)
    parse_type ||= default_parser
    scans = []
    case parse_type
    when 'xmlparser'
      # NOTE(review): MS::MzXML is not defined anywhere in the visible
      # source (the namespace here is MS::Parser::MzXML) -- confirm this
      # constant path.
      parser = MS::MzXML::XMLParser::TimeMzIntenIndexer.new
      parser.parse(IO.read(mzXML_file))
      scans = parser.scans_by_num
    when 'rexml' # use REXML
      # This is really too slow for files of this size
      # ::REXML, because a bare REXML here would find the parser class
      # MS::Parser::MzXML::REXML once it is loaded.
      doc = File.open(mzXML_file) {|fh| ::REXML::Document.new(fh) }
      doc.elements.each('msRun/scan') do |scan|
        rt = scan.attributes['retentionTime']  ## like PT0.154000S"
        level = scan.attributes['msLevel'].to_i
        prec_mz = nil
        prec_int = nil
        if level != 1
          scan.elements.each("precursorMz") do |prec|
            prec_mz = prec.text.to_f
            prec_int = prec.attributes["precursorIntensity"].to_f
          end
        end
        # remove the leading PT and trailing S on the retention time!
        rt = rt[2...-1]

        num = scan.attributes['num'].to_i
        scans[num] = MS::Scan.new(num, level, rt.to_f, prec_mz, prec_int)
      end #doc.elements
    else
      # raise, not throw: throw is for catch/throw control flow
      raise ArgumentError, "invalid parse type: #{parse_type}"
    end
    ## update the scans for parents
    MS::Scan.add_parent_scan(scans)
    scans
  end

  # Returns a Hash indexed by filename (with no extension) for a given path
  # extension = glob (string) or regex
  # The basename is given as: file.split('.').first
  # Any other kind of extension prints a message and yields an empty hash.
  def precursor_mz_by_scan_for_path(path, extension, parse_type=nil)
    hash = {}
    Dir.chdir path do
      files =
        case extension
        when String
          Dir[extension]
        when ::Regexp
          # ::Regexp, because a bare Regexp here would find the parser class
          # MS::Parser::MzXML::Regexp once it is loaded.
          Dir.entries(".").select {|entry| entry =~ extension }
        else
          puts "extension: #{extension} not a String or Regexp!"
          []
        end
      files.each do |file|
        base = file.split('.').first
        hash[base] = precursor_mz_by_scan(file, parse_type)
      end
    end
    hash
  end

  # Returns hash where hash[scan_num] = [precursorMz, precursorIntensity]
  # Parent scans are not hashed
  # Keys and values are both strings
  def precursor_mz_and_inten_by_scan(file)
    # in progress
  end

  # Returns array where array[scan_num] = precursorMz
  # precursorMz are Floats
  # Array index likely starts at 1!
  def precursor_mz_by_scan_num(file)
    ## THIS SHOULD BE CREATED IN specific XML LIBS
  end

  # Returns a hash of basic info on an mzXML run:
  #   *mzXML_elemt*   *hash keys (symbols)*
  #   scanCount       scan_count
  #   startTime       start_time
  #   endTime         end_time
  #   startMz         start_mz
  #   endMz           end_mz
  # :ms_level holds the msLevel of the first scan; :start_mz/:end_mz are only
  # set when that level is 1.  :scan_count is an array: the scanCount total
  # followed by the number of scans seen per ms level, cut off at the first
  # level with no scans.
  def basic_info(mzxml_file)
    puts "parsing: #{mzxml_file} #{File.exist?(mzxml_file)}" if $VERBOSE
    hash = {}
    scan_count_tmp = []
    (1..5).each {|n| scan_count_tmp[n] = 0 }
    @fh = File.open(mzxml_file)
    begin
      @line = ""
      scan_count_tmp[0] = _el("scanCount").to_i
      hash[:start_time] = _el("startTime").sub(/^PT/, "").sub(/S$/,"").to_f
      hash[:end_time] = _el("endTime").sub(/^PT/, "").sub(/S$/,"").to_f
      hash[:ms_level] = _el("msLevel").to_i
      scan_count_tmp[1] = 1
      if hash[:ms_level] == 1
        hash[:start_mz] = _el("startMz").to_f
        hash[:end_mz] = _el("endMz").to_f
      end

      until @fh.eof?
        @line = @fh.readline
        ms_level = _el("msLevel")
        break unless ms_level
        level = ms_level.to_i
        # nil.to_i is 0, so levels above 5 start counting from zero too
        scan_count_tmp[level] = scan_count_tmp[level].to_i + 1
      end
    ensure
      # close the handle even when a header attribute is missing
      @fh.close
    end
    scan_count = []
    scan_count_tmp.each do |cnt|
      break if cnt.to_i == 0
      scan_count.push cnt
    end
    hash[:scan_count] = scan_count
    hash
  end

  # returns [start_mz, end_mz] of the first full scan (ms_level == 1)
  # Returns nil when the file holds no scan with msLevel 1 (this case, and a
  # file whose first msLevel was not 1, used to loop forever).
  def start_and_end_mz(mzxml_file)
    @fh = File.open(mzxml_file)
    begin
      @line = ""
      loop do
        level = _el("msLevel")
        return nil unless level
        break if level.to_i == 1
        # _el leaves @line on the line it matched; move past it so the same
        # msLevel is not found again
        return nil if @fh.eof?
        @line = @fh.readline
      end
      start_mz = _el("startMz").to_f
      end_mz = _el("endMz").to_f
      [start_mz, end_mz]
    ensure
      @fh.close
    end
  end

  # Scans forward from the current @line (reading further lines from @fh as
  # needed) until a line contains name="value"; returns a copy of value, or
  # nil when the end of the file is reached without a match.  @line is left
  # on the matching line.
  def _el(name)
    # [^"]* stops at the closing quote, so several attributes on one line no
    # longer bleed into the captured value
    re = /#{name}="([^"]*)"/
    until (found = re.match(@line)) || @fh.eof?
      @line = @fh.readline
    end
    found ? found[1].dup : nil
  end

end
|
|
174
|
+
|
|
175
|
+
|