mspire 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +25 -0
- data/Rakefile +129 -40
- data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
- data/bin/bioworks_to_pepxml.rb +1 -0
- data/bin/fasta_shaker.rb +1 -96
- data/bin/filter_and_validate.rb +5 -0
- data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
- data/bin/prob_validate.rb +6 -0
- data/bin/raw_to_mzXML.rb +2 -2
- data/bin/srf_group.rb +1 -0
- data/bin/srf_to_sqt.rb +40 -0
- data/changelog.txt +68 -0
- data/lib/align/chams.rb +6 -6
- data/lib/align.rb +4 -3
- data/lib/bsearch.rb +120 -0
- data/lib/fasta.rb +318 -86
- data/lib/group_by.rb +10 -0
- data/lib/index_by.rb +11 -0
- data/lib/merge_deep.rb +21 -0
- data/lib/{spec → ms/converter}/mzxml.rb +77 -109
- data/lib/ms/gradient_program.rb +171 -0
- data/lib/ms/msrun.rb +209 -0
- data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
- data/lib/ms/parser/mzdata/axml.rb +12 -0
- data/lib/ms/parser/mzdata/dom.rb +160 -0
- data/lib/ms/parser/mzdata/libxml.rb +7 -0
- data/lib/ms/parser/mzdata.rb +25 -0
- data/lib/ms/parser/mzxml/axml.rb +11 -0
- data/lib/ms/parser/mzxml/dom.rb +159 -0
- data/lib/ms/parser/mzxml/hpricot.rb +253 -0
- data/lib/ms/parser/mzxml/libxml.rb +15 -0
- data/lib/ms/parser/mzxml/regexp.rb +122 -0
- data/lib/ms/parser/mzxml/rexml.rb +72 -0
- data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
- data/lib/ms/parser/mzxml.rb +175 -0
- data/lib/ms/parser.rb +108 -0
- data/lib/ms/precursor.rb +10 -0
- data/lib/ms/scan.rb +81 -0
- data/lib/ms/spectrum.rb +193 -0
- data/lib/ms.rb +10 -0
- data/lib/mspire.rb +4 -0
- data/lib/roc.rb +61 -1
- data/lib/sample_enzyme.rb +31 -8
- data/lib/scan_i.rb +21 -0
- data/lib/spec_id/aa_freqs.rb +7 -3
- data/lib/spec_id/bioworks.rb +20 -14
- data/lib/spec_id/digestor.rb +139 -0
- data/lib/spec_id/mass.rb +116 -0
- data/lib/spec_id/parser/proph.rb +236 -0
- data/lib/spec_id/precision/filter/cmdline.rb +209 -0
- data/lib/spec_id/precision/filter/interactive.rb +134 -0
- data/lib/spec_id/precision/filter/output.rb +147 -0
- data/lib/spec_id/precision/filter.rb +623 -0
- data/lib/spec_id/precision/output.rb +60 -0
- data/lib/spec_id/precision/prob/cmdline.rb +139 -0
- data/lib/spec_id/precision/prob/output.rb +88 -0
- data/lib/spec_id/precision/prob.rb +171 -0
- data/lib/spec_id/proph/pep_summary.rb +92 -0
- data/lib/spec_id/proph/prot_summary.rb +484 -0
- data/lib/spec_id/proph.rb +2 -466
- data/lib/spec_id/protein_summary.rb +2 -2
- data/lib/spec_id/sequest/params.rb +316 -0
- data/lib/spec_id/sequest/pepxml.rb +1513 -0
- data/lib/spec_id/sequest.rb +2 -1672
- data/lib/spec_id/srf.rb +445 -177
- data/lib/spec_id.rb +183 -95
- data/lib/spec_id_xml.rb +8 -10
- data/lib/transmem/phobius.rb +147 -0
- data/lib/transmem/toppred.rb +368 -0
- data/lib/transmem.rb +157 -0
- data/lib/validator/aa.rb +135 -0
- data/lib/validator/background.rb +73 -0
- data/lib/validator/bias.rb +95 -0
- data/lib/validator/cmdline.rb +260 -0
- data/lib/validator/decoy.rb +94 -0
- data/lib/validator/digestion_based.rb +69 -0
- data/lib/validator/probability.rb +48 -0
- data/lib/validator/prot_from_pep.rb +234 -0
- data/lib/validator/transmem.rb +272 -0
- data/lib/validator/true_pos.rb +46 -0
- data/lib/validator.rb +214 -0
- data/lib/xml.rb +38 -0
- data/lib/xml_style_parser.rb +105 -0
- data/lib/xmlparser_wrapper.rb +19 -0
- data/script/compile_and_plot_smriti_final.rb +97 -0
- data/script/extract_gradient_programs.rb +56 -0
- data/script/get_apex_values_rexml.rb +44 -0
- data/script/mzXML2timeIndex.rb +1 -1
- data/script/smriti_final_analysis.rb +103 -0
- data/script/toppred_to_yaml.rb +47 -0
- data/script/tpp_installer.rb +1 -1
- data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
- data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
- data/specs/bin/fasta_shaker_spec.rb +259 -0
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
- data/specs/bin/filter_and_validate_spec.rb +124 -0
- data/specs/bin/ms_to_lmat_spec.rb +34 -0
- data/specs/bin/prob_validate_spec.rb +62 -0
- data/specs/bin/protein_summary_spec.rb +10 -0
- data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
- data/specs/gi_spec.rb +22 -0
- data/specs/load_bin_path.rb +7 -0
- data/specs/merge_deep_spec.rb +13 -0
- data/specs/ms/gradient_program_spec.rb +77 -0
- data/specs/ms/msrun_spec.rb +455 -0
- data/specs/ms/parser_spec.rb +92 -0
- data/specs/ms/spectrum_spec.rb +89 -0
- data/specs/roc_spec.rb +251 -0
- data/specs/rspec_autotest.rb +149 -0
- data/specs/sample_enzyme_spec.rb +41 -0
- data/specs/spec_helper.rb +133 -0
- data/specs/spec_id/aa_freqs_spec.rb +52 -0
- data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
- data/specs/spec_id/digestor_spec.rb +75 -0
- data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
- data/specs/spec_id/precision/filter/output_spec.rb +31 -0
- data/specs/spec_id/precision/filter_spec.rb +243 -0
- data/specs/spec_id/precision/prob_spec.rb +111 -0
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
- data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
- data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
- data/specs/spec_id/sequest/params_spec.rb +68 -0
- data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
- data/specs/spec_id/sqt_spec.rb +138 -0
- data/specs/spec_id/srf_spec.rb +209 -0
- data/specs/spec_id/srf_spec_helper.rb +302 -0
- data/specs/spec_id_helper.rb +33 -0
- data/specs/spec_id_spec.rb +361 -0
- data/specs/spec_id_xml_spec.rb +33 -0
- data/specs/transmem/phobius_spec.rb +423 -0
- data/specs/transmem/toppred_spec.rb +297 -0
- data/specs/transmem_spec.rb +60 -0
- data/specs/transmem_spec_shared.rb +64 -0
- data/specs/validator/aa_spec.rb +107 -0
- data/specs/validator/background_spec.rb +51 -0
- data/specs/validator/bias_spec.rb +146 -0
- data/specs/validator/decoy_spec.rb +51 -0
- data/specs/validator/fasta_helper.rb +26 -0
- data/specs/validator/prot_from_pep_spec.rb +141 -0
- data/specs/validator/transmem_spec.rb +145 -0
- data/specs/validator/true_pos_spec.rb +58 -0
- data/specs/validator_helper.rb +33 -0
- data/specs/xml_spec.rb +12 -0
- data/test_files/000_pepxml18_small.xml +206 -0
- data/test_files/020a.mzXML.timeIndex +4710 -0
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
- data/test_files/4-03-03_small-prot.xml +321 -0
- data/test_files/4-03-03_small.xml +3876 -0
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +5999 -0
- data/test_files/bioworks31.params +77 -0
- data/test_files/bioworks32.params +62 -0
- data/test_files/bioworks33.params +63 -0
- data/test_files/bioworks_single_run_small.xml +7237 -0
- data/test_files/bioworks_small.fasta +212 -0
- data/test_files/bioworks_small.params +63 -0
- data/test_files/bioworks_small.phobius +109 -0
- data/test_files/bioworks_small.toppred.out +2847 -0
- data/test_files/bioworks_small.xml +5610 -0
- data/test_files/bioworks_with_INV_small.xml +3753 -0
- data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +304 -0
- data/test_files/messups.fasta +297 -0
- data/test_files/opd1/000.my_answer.100lines.xml +101 -0
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
- data/test_files/opd1/000_020_3prots-prot.xml +62 -0
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
- data/test_files/opd1/sequest.3.1.params +77 -0
- data/test_files/opd1/sequest.3.2.params +62 -0
- data/test_files/opd1/twenty_scans.mzXML +418 -0
- data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +9 -0
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
- data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
- data/test_files/pepproph_small.xml +4691 -0
- data/test_files/phobius.small.noheader.txt +50 -0
- data/test_files/phobius.small.small.txt +53 -0
- data/test_files/s01_anC1_ld020mM.key.txt +25 -0
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +297 -0
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +14340 -0
- data/test_files/tf_bioworks2excel.txt.actual +1035 -0
- data/test_files/toppred.small.out +416 -0
- data/test_files/toppred.xml.out +318 -0
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
- data/test_files/yeast_gly_small-prot.xml +265 -0
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
- data/test_files/yeast_gly_small.xml +3807 -0
- data/test_files/yeast_gly_small2.parentTimes +6 -0
- metadata +273 -57
- data/bin/filter.rb +0 -6
- data/bin/precision.rb +0 -5
- data/lib/spec/mzdata/parser.rb +0 -108
- data/lib/spec/mzdata.rb +0 -48
- data/lib/spec/mzxml/parser.rb +0 -449
- data/lib/spec/scan.rb +0 -55
- data/lib/spec_id/filter.rb +0 -797
- data/lib/spec_id/precision.rb +0 -421
- data/lib/toppred.rb +0 -18
- data/script/filter-peps.rb +0 -164
- data/test/tc_aa_freqs.rb +0 -59
- data/test/tc_fasta_shaker.rb +0 -149
- data/test/tc_filter.rb +0 -203
- data/test/tc_filter_peps.rb +0 -46
- data/test/tc_gi.rb +0 -17
- data/test/tc_id_class_anal.rb +0 -70
- data/test/tc_id_precision.rb +0 -89
- data/test/tc_msrun.rb +0 -88
- data/test/tc_mzxml.rb +0 -88
- data/test/tc_mzxml_to_lmat.rb +0 -36
- data/test/tc_peptide_parent_times.rb +0 -27
- data/test/tc_precision.rb +0 -60
- data/test/tc_roc.rb +0 -166
- data/test/tc_sample_enzyme.rb +0 -32
- data/test/tc_scan.rb +0 -26
- data/test/tc_sequest.rb +0 -336
- data/test/tc_spec.rb +0 -78
- data/test/tc_spec_id.rb +0 -201
- data/test/tc_spec_id_xml.rb +0 -36
- data/test/tc_srf.rb +0 -262
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
require 'xml_style_parser'
|
|
2
|
+
require 'ms/spectrum'
|
|
3
|
+
require 'ms/scan'
|
|
4
|
+
|
|
5
|
+
# Forward declaration of the MzData namespace so the DOM class defined below
# can be nested inside it and include it.
# NOTE(review): this line needs MS::Parser to exist already -- presumably one
# of the requires above defines it; confirm.
module MS::Parser::MzData ; end
|
|
6
|
+
|
|
7
|
+
# Tree-walking (DOM style) parser for mzData files.
#
# The XML tree is obtained through get_root_node_from_file, which is not
# defined in this class (NOTE(review): presumably supplied by XMLStyleParser
# or a backend-specific subclass -- confirm).  Nodes are expected to respond
# to find_first, find, child, child?, next, name, content and [] (attribute
# lookup).
class MS::Parser::MzData::DOM
  include XMLStyleParser
  include MS::Parser::MzData

  # parse_type:: what will be parsed (stored in @method); defaults to :msrun
  # version::    format version string (stored in @version)
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # true if the description node holds
  # <dataProcessing><software><name>Bioworks Browser</name> with
  # <version>3.3</version>; otherwise false.  Any error raised while walking
  # the nodes (e.g. a missing element) is treated as "not Bioworks 3.3".
  def is_bioworks33?(description_node)
    begin
      software_node = description_node.find_first('child::dataProcessing').find_first('child::software')
      name = software_node.find_first('child::name').content
      version = software_node.find_first('child::version').content
      ((name == 'Bioworks Browser') and (version == '3.3'))
    rescue
      false
    end
  end

  # Reads every spectrum in +file+ into an MSRun object.
  #
  # OPTIONS:
  #    :msrun => MSRun          # use this object instead of creating one
  #    :spectra => *true|false  # if false don't get spectra
  #
  # Note that +opts+ is modified: :spectra is set to true when absent.
  #
  # NOTE(review): the value returned is that of the last assignment (the end
  # time), not the run object, so callers wanting the run should pass their
  # own object in through :msrun.  Returns nil when the file has no spectra.
  def msrun(file, opts={})
    unless opts.key?(:spectra)
      opts[:spectra] = true
    end
    msrun_obj = opts[:msrun] || MS::MSRun.new

    # should ensure that parsing is not counting spaces...
    id_to_scan_hash = {}

    # positions filled in on each scan:
    #    0   1       2             3       4     5          6
    # %w(num msLevel retentionTime startMz endMz precursors spectrum)

    root = get_root_node_from_file(file)
    scan_count = 0
    description = root.find_first('child::description')
    bioworks33 = is_bioworks33?(description)
    spectrum_list = description.next

    # Scans are collected in document order.  The declared 'count' attribute
    # is not used to size or index the array: bioworks33 reports wrong
    # numbers, and Array(n) would only have produced [n] anyway.
    scans = []

    # if I move from node to node, it means I've checked that it's a sequence
    # and that the elements are req'd
    if spectrum_list.child?
      spectrum_n = spectrum_list.child
      loop do
        scan_count += 1
        scan = MS::Scan.new(9)
        id = spectrum_n["id"].to_i
        id_to_scan_hash[id] = scan
        spec_desc_n = spectrum_n.child        # required in sequence
        spec_settings_n = spec_desc_n.child   # required in sequence
        if acq_n = spec_settings_n.find_first('descendant::acquisition')
          scan[0] = acq_n['acqNumber'].to_i
        else
          scan[0] = id
        end
        spec_inst_n = spec_settings_n.find_first('child::spectrumInstrument')
        scan[1] = spec_inst_n['msLevel'].to_i

        scans << scan

        scan[3] = spec_inst_n['mzRangeStart'].to_f
        scan[4] = spec_inst_n['mzRangeStop'].to_f
        spec_inst_n.find('child::cvParam').each do |cv_param|
          if cv_param['name'] == 'TimeInMinutes'
            scan[2] = cv_param['value'].to_f * 60  # convert to seconds
          end
        end

        if scan[1] > 1  # precursormz info
          prec_list_n = spec_settings_n.next
          abort('can only process one precursor m/z right now!') if prec_list_n['count'] != '1'
          scan[5] = prec_list_n.find('child::precursor').map do |prec_n|
            # %w(mz inten parent ms_level parent charge_states)
            prec = MS::Precursor.new
            unless bioworks33  # bioworks33 points to the wrong scan!!!
              prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
            end
            prec[3] = prec_n['msLevel'].to_i
            charges = []
            prec_n.find('descendant::cvParam').each do |cv_param_n|
              case cv_param_n['name']
              when 'MassToChargeRatio'
                prec[0] = cv_param_n['value'].to_f
                # find the prec intensity; skipped when the parent scan is
                # unknown or carries no spectrum (e.g. :spectra => false)
                unless bioworks33
                  parent_spectrum = prec[2] && prec[2].spectrum
                  prec[1] = parent_spectrum.intensity_at_mz(prec[0]) if parent_spectrum
                end
              when 'ChargeState'
                charges << cv_param_n['value'].to_i
              end
            end
            prec[5] = charges
            prec
          end
        else  # no precursors
          scan[5] = []
        end

        if opts[:spectra]
          # walk forward through the siblings until the m/z array is found
          mz_array_bin_n = spec_desc_n.next
          while mz_array_bin_n && mz_array_bin_n.name != 'mzArrayBinary'
            mz_array_bin_n = mz_array_bin_n.next
          end
          unless mz_array_bin_n
            raise RuntimeError, "no mzArrayBinary node found for spectrum id #{id}"
          end
          mz = binary_node_to_array(mz_array_bin_n)
          # the intensity array is taken to be the very next sibling
          inten = binary_node_to_array(mz_array_bin_n.next)
          scan[6] = MS::Spectrum.new(mz, inten)
        end

        # set up the next loop
        break unless spectrum_n = spectrum_n.next
      end
    end

    if bioworks33
      MS::MSRun.add_parent_scan(scans, opts[:spectra])
    end
    msrun_obj.scans = scans
    msrun_obj.scan_count = scan_count
    unless scans.empty?
      msrun_obj.start_time = msrun_obj.scans.first.time
      msrun_obj.end_time = msrun_obj.scans.last.time
    end
  end

  private

  # Decodes the base64 payload held in the first child of a binary array
  # node (mzArrayBinary or its following sibling).  The byte order flag
  # passed to MS::Spectrum.base64_to_array is false only when the 'endian'
  # attribute is exactly 'little'.
  def binary_node_to_array(array_bin_n)
    data_n = array_bin_n.child
    MS::Spectrum.base64_to_array(data_n.content, data_n['precision'].to_i, (data_n['endian'] != 'little'))
  end

end
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
require 'ms/msrun'
|
|
2
|
+
|
|
3
|
+
# Make sure the top-level MS namespace exists before it is referenced below.
module MS; end
|
|
4
|
+
|
|
5
|
+
module MS::Parser::MzData
  # directory (relative to the load path) holding the backend parser files
  Base_dir_for_parsers = 'ms/parser/mzdata'

  # inherits XMLStyleParser and version
  include MS::Parser
  include XMLStyleParser

  # Factory: loads the parser files found under Base_dir_for_parsers, lets
  # XMLStyleParser.choose_parser pick a parser class for this module and
  # returns a new instance of that class built with (parse_type, version).
  # The arguments are also remembered on the module itself in @method and
  # @version.
  def self.new(parse_type=:msrun, version='1.05')
    @method = parse_type
    @version = version
    XMLStyleParser.require_parse_files(Base_dir_for_parsers)
    XMLStyleParser.choose_parser(self, parse_type).new(parse_type, version)
  end

end
|
|
24
|
+
|
|
25
|
+
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
require 'xml_style_parser'
|
|
2
|
+
require 'ms/spectrum'
|
|
3
|
+
require 'ms/scan'
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Tree-walking (DOM style) parser for mzXML files.
#
# The XML tree comes from get_root_node_from_file / get_root_node_from_string,
# which are not defined in this class (NOTE(review): presumably supplied by
# XMLStyleParser or a backend-specific subclass -- confirm).
class MS::Parser::MzXML::DOM
  include XMLStyleParser
  include MS::Parser::MzXML

  #@@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)

  # parse_type:: what will be parsed (stored in @method); defaults to :msrun
  # version::    mzXML version string (stored in @version)
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # Builds an MS::Scan from the attributes of a <scan> node (anything that
  # answers [] with attribute strings).  Fills positions 0 (num), 1 (msLevel)
  # and 2 (retentionTime), plus 3/4 (startMz/endMz) when startMz is present.
  def new_scan_from_hash(node)
    scan = MS::Scan.new  # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    # drops the first two characters and the last one before converting
    # (NOTE(review): presumably a duration such as "PT1.5S" -- confirm)
    scan[2] = node['retentionTime'][2...-1].to_f
    if x = node['startMz']
      scan[3] = x.to_f
      scan[4] = node['endMz'].to_f
    end
    scan
  end

  # Takes a scan node and creates a scan object.
  #
  # scan_n::       the <scan> node
  # scans_by_num:: scans parsed so far, indexed by scan number; used to
  #                resolve a precursor's 'precursorScanNum' to its parent scan
  # get_spectra::  when false, <peaks> children are skipped
  #
  # Only versions below '3.0' are handled (plain string comparison on
  # @version); otherwise the process is aborted.
  def create_scan(scan_n, scans_by_num, get_spectra=true)
    if @version < '3.0'
      scan = new_scan_from_hash(scan_n)
      precs = []
      scan_n.each do |node|
        case node.name
        when 'precursorMz'
          # should be able to do this!!!
          #scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
          prec = MS::Precursor.new
          prec[1] = node['precursorIntensity'].to_f
          prec[0] = node.content.to_f
          if x = node['precursorScanNum']
            prec[2] = scans_by_num[x.to_i]
          end
          precs << prec
        when 'peaks'
          next unless get_spectra
          # SHOULD be able to do this!!
          #peaks_n = scan_n.find_first('child::peaks')
          scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
        end
      end
      scan[5] = precs
      scan
    else   # for version > 3.0
      abort 'not supporting version 3.0 just yet'
      # note that mzXML version 3.0 *can* have more than one peak...
      # I'm not sure how to deal with that since I have one spectrum/scan
    end
  end


  # returns an array of msrun objects (not implemented)
  def msruns(file)
    raise NotImplementedError
  end

  # Returns the contents of +file+ as a string with doubled </scan></scan>
  # tags collapsed into one and a missing </scan> inserted between a closing
  # </peaks> and the next <scan.  We do this in windows style (\r\n) since
  # these are generated off a windows machine only.
  def fix_bad_scan_tags(file)
    IO.read(file).gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
  end

  # Right now cannot parse multiple runs out of an mzXML version 2 file since
  # this is built around a single run per file.
  #
  # OPTIONS:
  #    :msrun => MSRun          # use this object instead of creating one
  #    :spectra => *true|false  # if false don't get spectra
  #
  # Note that +opts+ is modified: :spectra is set to true when absent.
  # The value returned is that of the last assignment, i.e. the scans array.
  #
  # Raises NotImplementedError when more than one msRun node is present and
  # RuntimeError when no msRun node can be located.
  def msrun(file, opts={})
    unless opts.key?(:spectra)
      opts[:spectra] = true
    end

    msrun_obj = opts[:msrun] || MS::MSRun.new

    root =
      if @version == '2.0'
        # version 2.0 files get their scan tags repaired before parsing
        get_root_node_from_string(fix_bad_scan_tags(file))
      else
        get_root_node_from_file(file)
      end

    # right now we are only finding the first msRun (probably a rare case of
    # multiple runs in an mzXML file...)
    msrun_n =
      if @version >= '2.0'
        kids = root.children.select {|v| v.name == 'msRun' }
        raise(NotImplementedError, "one msrun per doc right now" ) if kids.size > 1
        kids.first
      else
        root
      end
    if msrun_n.nil?
      raise RuntimeError, "no msRun node found"
    end
    if msrun_n.name != 'msRun'
      raise RuntimeError, "extra node slipped in somehow"
    end

    ## HEADER
    scan_count = msrun_n['scanCount'].to_i
    msrun_obj.scan_count = scan_count
    scans_by_num = Array.new(scan_count + 1)

    ## SPECTRUM
    scans = Array.new( scan_count )
    scn_index = 0

    # we should be able to do this, but it's not working!!!
    #scan_n = msrun_n.find_first('scan')
    #while (scn_index < scan_count)
    get_spectra = opts[:spectra]

    msrun_n.each do |scan_n|
      next unless scan_n.name == 'scan'
      scan = create_scan(scan_n, scans_by_num, get_spectra)
      scans[scn_index] = scan
      scans_by_num[scan[0]] = scan
      scn_index += 1
    end

    ## update the scan's parents
    MS::MSRun.add_parent_scan(scans)

    # note that startTime and endTime are optional AND in >2.2 are dateTime
    # instead of duration types!, so we will just use scan times...
    # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
    # export.  They give the start and end time in seconds, but they are
    # really minutes.  All the more reason to use the first and last scans!
    msrun_obj.start_time = scans.first.time
    msrun_obj.end_time = scans.last.time

    msrun_obj.scans = scans
  end

end
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
|
|
2
|
+
require 'xml_style_parser'
|
|
3
|
+
require 'ms/spectrum'
|
|
4
|
+
require 'ms/scan'
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# mzXML parser built on Hpricot (::Hpricot.XML).
#
# NOTE(review): the node calls used below (each_child, name, content, [])
# mirror the DOM parser; whether every Hpricot node type answers all of them
# could not be confirmed from this file -- verify against real data.
class MS::Parser::MzXML::Hpricot
  include XMLStyleParser
  include MS::Parser::MzXML

  @@scan_atts = %w(num msLevel retentionTime startMz endMz precursors spectrum)

  # parse_type:: what will be parsed (stored in @method); defaults to :msrun
  # version::    mzXML version string (stored in @version)
  def initialize(parse_type=:msrun, version='1.0')
    @method = parse_type
    @version = version
  end

  # Builds an MS::Scan from the attributes of a <scan> node (anything that
  # answers [] with attribute strings).  Fills positions 0 (num), 1 (msLevel)
  # and 2 (retentionTime), plus 3/4 (startMz/endMz) when startMz is present.
  def new_scan_from_hash(node)
    scan = MS::Scan.new  # array class creates one with 9 positions
    scan[0] = node['num'].to_i
    scan[1] = node['msLevel'].to_i
    # drops the first two characters and the last one before converting
    # (NOTE(review): presumably a duration such as "PT1.5S" -- confirm)
    scan[2] = node['retentionTime'][2...-1].to_f
    if x = node['startMz']
      scan[3] = x.to_f
      scan[4] = node['endMz'].to_f
    end
    scan
  end

  # Takes a scan node and creates a scan object.
  #
  # scan_n::       the <scan> node
  # scans_by_num:: scans parsed so far, indexed by scan number; used to
  #                resolve a precursor's 'precursorScanNum' to its parent scan
  # get_spectra::  when false, <peaks> children are skipped
  #
  # Only versions below '3.0' are handled (plain string comparison on
  # @version); otherwise the process is aborted.
  def create_scan(scan_n, scans_by_num, get_spectra=true)
    if @version < '3.0'
      scan = new_scan_from_hash(scan_n)
      precs = []
      scan_n.each_child do |node|
        case node.name
        when 'precursorMz'
          prec = MS::Precursor.new
          prec[1] = node['precursorIntensity'].to_f
          prec[0] = node.content.to_f
          if x = node['precursorScanNum']
            prec[2] = scans_by_num[x.to_i]
          end
          precs << prec
        when 'peaks'
          next unless get_spectra
          scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
        end
      end
      scan[5] = precs
      scan
    else   # for version > 3.0
      abort 'not supporting version 3.0 just yet'
      # note that mzXML version 3.0 *can* have more than one peak...
      # I'm not sure how to deal with that since I have one spectrum/scan
    end
  end


  # returns an array of msrun objects (not implemented)
  def msruns(file)
    raise NotImplementedError
  end

  # Returns the contents of +file+ as a string with doubled </scan></scan>
  # tags collapsed into one and a missing </scan> inserted between a closing
  # </peaks> and the next <scan.  We do this in windows style (\r\n) since
  # these are generated off a windows machine only.
  def fix_bad_scan_tags(file)
    IO.read(file).gsub(/<\/scan>\s+<\/scan>/m, '</scan>').gsub(/<\/peaks>\s+<scan/m, "</peaks>\r\n </scan>\r\n <scan")
  end

  # Right now cannot parse multiple runs out of an mzXML version 2 file since
  # this is built around a single run per file.
  #
  # OPTIONS:
  #    :msrun => MSRun          # use this object instead of creating one
  #    :spectra => *true|false  # if false don't get spectra
  #
  # Note that +opts+ is modified: :spectra is set to true when absent.
  # The value returned is that of the last assignment, i.e. the scans array.
  #
  # Raises RuntimeError when the document has no msRun element.
  def msrun(file, opts={})
    unless opts.key?(:spectra)
      opts[:spectra] = true
    end

    msrun_obj = opts[:msrun] || MS::MSRun.new

    # the file is parsed as is; the version 2.0 scan tag repair done by the
    # DOM parser (fix_bad_scan_tags) may not be necessary in hpricot
    doc = File.open(file) {|fh| ::Hpricot.XML(fh) }
    msrun_n = doc.at('msRun')
    if msrun_n.nil?
      raise RuntimeError, "no msRun node found"
    end

    ## HEADER
    scan_count = msrun_n['scanCount'].to_i
    msrun_obj.scan_count = scan_count
    scans_by_num = Array.new(scan_count + 1)

    ## SPECTRUM
    scans = Array.new( scan_count )
    scn_index = 0
    get_spectra = opts[:spectra]

    msrun_n.each_child do |scan_n|
      # skip children that are not <scan> elements (including any node kind
      # that has no name at all)
      next unless scan_n.respond_to?(:name) && scan_n.name == 'scan'
      scan = create_scan(scan_n, scans_by_num, get_spectra)
      scans[scn_index] = scan
      scans_by_num[scan[0]] = scan
      scn_index += 1
    end

    ## update the scan's parents
    MS::MSRun.add_parent_scan(scans)

    # note that startTime and endTime are optional AND in >2.2 are dateTime
    # instead of duration types!, so we will just use scan times...
    # Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
    # export.  They give the start and end time in seconds, but they are
    # really minutes.  All the more reason to use the first and last scans!
    msrun_obj.start_time = scans.first.time
    msrun_obj.end_time = scans.last.time

    msrun_obj.scans = scans
  end

end
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
=begin
|
|
153
|
+
## THIS IS THE SAX PARSER VERSION. IT NEEDS A BIT OF BRUSH UP AND IT WOULD
|
|
154
|
+
## WORK. I THINK THE default guy is probably faster
|
|
155
|
+
|
|
156
|
+
def msrun(file, msrun_obj)
|
|
157
|
+
# Figure out where the first scan is at in the file:
|
|
158
|
+
pos_after_first_scan = nil
|
|
159
|
+
File.open(file) do |fh|
|
|
160
|
+
fh.each do |line|
|
|
161
|
+
if line =~ /<scan/
|
|
162
|
+
pos_after_first_scan = fh.pos
|
|
163
|
+
end
|
|
164
|
+
end
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
# Get only the header:
|
|
168
|
+
header_string = IO.read(file, pos_after_first_scan)
|
|
169
|
+
|
|
170
|
+
@msrun_obj = msrun_obj
|
|
171
|
+
# Parse out the header info:
|
|
172
|
+
parser = XML::SaxParser.new
|
|
173
|
+
parser.string = header_string
|
|
174
|
+
parser.on_start_element do |name, attrs|
|
|
175
|
+
if name == 'msRun'
|
|
176
|
+
@msrun_obj.scan_count = attrs['scanCount'].to_i
|
|
177
|
+
@msrun_obj.start_time = attrs['startTime'][2...-1].to_f
|
|
178
|
+
@msrun_obj.end_time = attrs['endTime'][2...-1].to_f
|
|
179
|
+
end
|
|
180
|
+
end
|
|
181
|
+
parser.parse
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# Parse the scans out:
|
|
185
|
+
scan_st = 'scan'
|
|
186
|
+
prec_st = 'precursorMz'
|
|
187
|
+
peaks_st = 'peaks'
|
|
188
|
+
prec_inten_st = 'precursorIntensity'
|
|
189
|
+
precision_st = 'precision'
|
|
190
|
+
|
|
191
|
+
#parser = MS::Parser::MzXML::Hpricot::SaxParser::MSRun.new
|
|
192
|
+
parser = XML::SaxParser.new
|
|
193
|
+
parser.filename = file
|
|
194
|
+
parser.on_start_document do
|
|
195
|
+
@scans = []
|
|
196
|
+
@current_scan = nil
|
|
197
|
+
@get_peaks = false
|
|
198
|
+
@get_prec_mz = false
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
parser.on_characters do |chars|
|
|
202
|
+
if @get_peaks
|
|
203
|
+
@get_peaks << chars
|
|
204
|
+
elsif @get_prec_mz
|
|
205
|
+
@get_prec_mz << chars
|
|
206
|
+
end
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
parser.on_end_element do |el|
|
|
210
|
+
case el
|
|
211
|
+
when 'peaks'
|
|
212
|
+
@current_scan.spectrum = Spectrum.from_base64_peaks(@get_peaks, @precision, true)
|
|
213
|
+
@get_peaks = false
|
|
214
|
+
when 'precursorMz'
|
|
215
|
+
@current_scan[5] = [Precursor.new([@get_prec_mz.to_f])]
|
|
216
|
+
@get_prec_mz = false
|
|
217
|
+
end
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
parser.on_start_element do |name, attr_hash|
|
|
221
|
+
case name
|
|
222
|
+
when scan_st
|
|
223
|
+
@current_scan = new_scan_from_hash(attr_hash)
|
|
224
|
+
sz = @scans.size
|
|
225
|
+
@scans << @current_scan
|
|
226
|
+
when prec_st
|
|
227
|
+
@current_scan[5].first[1] = attr_hash[prec_inten_st].to_f
|
|
228
|
+
@get_prec_mz = ''
|
|
229
|
+
when peaks_st
|
|
230
|
+
@precision = attr_hash[precision_st].to_i
|
|
231
|
+
case @version[0,1].to_ip
|
|
232
|
+
when 3
|
|
233
|
+
if ch['pairOrder'] != 'm/z-int' # only version 3.0 has others
|
|
234
|
+
abort "cannot yet read anything but 'm/z-int' pair order"
|
|
235
|
+
end
|
|
236
|
+
end
|
|
237
|
+
@get_peaks = ''
|
|
238
|
+
end
|
|
239
|
+
end
|
|
240
|
+
parser.parse
|
|
241
|
+
|
|
242
|
+
@msrun_obj.scans = @scans
|
|
243
|
+
@msrun_obj.scans.each_with_index do |sc,i|
|
|
244
|
+
if sc.spectrum.mz == nil
|
|
245
|
+
abort "INDEX: #{i}"
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
@msrun_obj
|
|
249
|
+
end
|
|
250
|
+
=end
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
|
|
2
|
+
require 'ms/parser/mzxml/dom'
|
|
3
|
+
|
|
4
|
+
# libxml backed flavour of the mzXML DOM parser: it only supplies the two
# hooks that turn a string or a file into a root node.
class MS::Parser::MzXML::LibXML < MS::Parser::MzXML::DOM
  # Parses an XML string and returns the document's root node.
  def get_root_node_from_string(string)
    XML::Parser.string(string).parse.root
  end
  # The method used to be published under this misspelled name only, which
  # meant calls to get_root_node_from_string never reached it; the old
  # spelling is kept so existing callers keep working.
  alias_method :goot_root_node_from_string, :get_root_node_from_string

  # Parses the XML file at +file+ and returns the document's root node.
  def get_root_node_from_file(file)
    XML::Document.file(file).root
  end
end
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|