mspire 0.4.9 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/ms/msrun_index.rb
DELETED
@@ -1,108 +0,0 @@
|
|
1
|
-
require 'ms/scan'
|
2
|
-
require 'ms/parser'
|
3
|
-
|
4
|
-
class MS::MSRunIndex
|
5
|
-
# basename_noext is the base name of the file (with NO extensions)
|
6
|
-
attr_accessor :scans_by_num
|
7
|
-
attr_reader :basename_noext
|
8
|
-
|
9
|
-
# identifies and removes .mzXML .mzXML.timeIndex and .timeIndex
|
10
|
-
# otherwise, removes one extension and that's the filename_noext
|
11
|
-
# Also, removes any leading path
|
12
|
-
def basename_noext=(filename)
|
13
|
-
ext = File.extname(filename)
|
14
|
-
basename = File.basename(filename)
|
15
|
-
case ext
|
16
|
-
when '.mzXML'
|
17
|
-
@basename_noext = basename.gsub(/\.mzXML$/, "")
|
18
|
-
when '.timeIndex'
|
19
|
-
@basename_noext = basename.gsub(/\.timeIndex$/, "")
|
20
|
-
if File.extname(@basename_noext) == ".mzXML"
|
21
|
-
@basename_noext.gsub!(/\.mzXML$/, "")
|
22
|
-
end
|
23
|
-
else
|
24
|
-
@basename_noext = basename.gsub(/#{Regexp.escape(ext)}/, "")
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
# index_file has one row for each scan:
|
29
|
-
# ms_level scan_num time [prec_mz prec_inten]
|
30
|
-
# also consider getting this data directly from the mzXML file
|
31
|
-
# via the MS::MzXML::Parser.get_msrun_index command
|
32
|
-
def set_from_index_file(index_file)
|
33
|
-
self.basename_noext = index_file
|
34
|
-
@scans_by_num = []
|
35
|
-
if index_file
|
36
|
-
File.open(index_file).each do |line|
|
37
|
-
next if line !~ /\d/ || line =~ /^#/
|
38
|
-
line.chomp!
|
39
|
-
arr = line.split(" ")
|
40
|
-
scan = MS::Scan.new(arr[1].to_i, arr[0].to_i, arr[2].to_f)
|
41
|
-
if scan.ms_level > 1
|
42
|
-
scan.prec_mz = arr[3].to_f
|
43
|
-
scan.prec_inten = arr[4].to_f
|
44
|
-
end
|
45
|
-
@scans_by_num[scan.num] = scan
|
46
|
-
end
|
47
|
-
end
|
48
|
-
MS::Scan.add_parent_scan(@scans_by_num)
|
49
|
-
end
|
50
|
-
|
51
|
-
# Takes a .mzXML file or .timeIndex file (currently)
|
52
|
-
# and creates an index of scans from it
|
53
|
-
def initialize(file=nil)
|
54
|
-
@scans_by_num = []
|
55
|
-
if file
|
56
|
-
ext = File.extname(file)
|
57
|
-
case ext
|
58
|
-
when '.mzXML'
|
59
|
-
set_from_mzxml(file)
|
60
|
-
when '.timeIndex'
|
61
|
-
set_from_index_file(file)
|
62
|
-
else
|
63
|
-
raise ArgumentError, "#{self.class}.new doesn't recognize files of extension: #{ext}"
|
64
|
-
end
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
|
-
|
69
|
-
# returns a new
|
70
|
-
def set_from_mzxml(file)
|
71
|
-
self.basename_noext = file
|
72
|
-
@scans_by_num = MS::Parser.new(file, :scans_by_num).parse(file)
|
73
|
-
end
|
74
|
-
|
75
|
-
# writes the index to filename
|
76
|
-
# each line:
|
77
|
-
# ms_level scan_num time (if !ms_level=1) { prec_mz prec_intensity)
|
78
|
-
def to_index_file(filename)
|
79
|
-
strings = []
|
80
|
-
@scans_by_num.each do |scan|
|
81
|
-
if scan
|
82
|
-
strings << scan.to_index_file_string
|
83
|
-
end
|
84
|
-
end
|
85
|
-
File.open(filename, "w") do |fh|
|
86
|
-
fh.print strings.join("\n")
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
# returns an array of the times of the precursor scan's parent (not its own
|
91
|
-
# acquisition time). The parent scan index will also retrieve the time of
|
92
|
-
# the parent scan.
|
93
|
-
def parent_times_by_scan_num
|
94
|
-
by_num = []
|
95
|
-
parent_time = nil
|
96
|
-
@scans_by_num.each_with_index do |scan,i|
|
97
|
-
if scan.ms_level == 1
|
98
|
-
parent_time = scan.time
|
99
|
-
end
|
100
|
-
by_num[i] = parent_time
|
101
|
-
end
|
102
|
-
by_num
|
103
|
-
end
|
104
|
-
|
105
|
-
end
|
106
|
-
|
107
|
-
|
108
|
-
|
@@ -1,67 +0,0 @@
|
|
1
|
-
require 'ms/parser/mzdata/dom'
|
2
|
-
|
3
|
-
class MS::Parser::MzData::AXML < MS::Parser::MzData::DOM
|
4
|
-
def get_root_node_from_file(file)
|
5
|
-
::AXML.parse_file(file)
|
6
|
-
end
|
7
|
-
def get_root_node_from_io(io)
|
8
|
-
::AXML.parse(io)
|
9
|
-
end
|
10
|
-
end
|
11
|
-
|
12
|
-
class MS::Parser::MzData::AXML::LazyData < MS::Parser::MzData::AXML
|
13
|
-
def get_root_node_from_string(string)
|
14
|
-
::AXML::LazyData.parse(string)
|
15
|
-
end
|
16
|
-
def get_root_node_from_file(file)
|
17
|
-
::AXML::LazyData.parse_file(file)
|
18
|
-
end
|
19
|
-
def get_root_node_from_io(io)
|
20
|
-
::AXML::LazyData.parse(io)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
class AXML::LazyData < AXML
|
25
|
-
# Returns the root node (as Element) or nodes (as Array)
|
26
|
-
def self.parse(stream)
|
27
|
-
parser = ::AXML::XMLParser::LazyData.new
|
28
|
-
parser.parse(stream)
|
29
|
-
parser.root
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
# This parser stores information about where the data (peaks) information is
|
34
|
-
# in the file
|
35
|
-
# The content of the data node is an array where the first member is the
|
36
|
-
# start index and the last member is the number of bytes. All other members
|
37
|
-
# should be ignored.
|
38
|
-
class AXML::XMLParser::LazyData < ::AXML::XMLParser
|
39
|
-
|
40
|
-
def startElement(name, attributes)
|
41
|
-
text =
|
42
|
-
if name == 'data' ; []
|
43
|
-
else ; ''
|
44
|
-
end
|
45
|
-
new_el = ::AXML::El.new(@cur, name, attributes, text, [])
|
46
|
-
# add the new node to the previous parent node
|
47
|
-
@cur.add_node(new_el)
|
48
|
-
# notice the change in @cur node
|
49
|
-
@cur = new_el
|
50
|
-
end
|
51
|
-
|
52
|
-
def character(data)
|
53
|
-
if @cur.text.is_a? Array
|
54
|
-
@cur.text << byteIndex
|
55
|
-
else
|
56
|
-
@cur.text << data
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
def endElement(name)
|
61
|
-
if @cur.text.is_a? Array
|
62
|
-
@cur.text << (byteIndex - @cur.text.first)
|
63
|
-
end
|
64
|
-
@cur = @cur.parent
|
65
|
-
end
|
66
|
-
|
67
|
-
end
|
data/lib/ms/parser/mzdata/dom.rb
DELETED
@@ -1,175 +0,0 @@
|
|
1
|
-
require 'xml_style_parser'
|
2
|
-
require 'ms/spectrum'
|
3
|
-
require 'ms/scan'
|
4
|
-
|
5
|
-
module MS::Parser::MzData ; end
|
6
|
-
|
7
|
-
class MS::Parser::MzData::DOM
|
8
|
-
include XMLStyleParser
|
9
|
-
include MS::Parser::MzData
|
10
|
-
|
11
|
-
def initialize(parse_type=:msrun, version='1.0')
|
12
|
-
@method = parse_type
|
13
|
-
@version = version
|
14
|
-
end
|
15
|
-
|
16
|
-
# true if there is a node <dataProcessing><software><name>Bioworks Browser</...>
|
17
|
-
# otherwise false
|
18
|
-
def is_bioworks33?(description_node)
|
19
|
-
begin
|
20
|
-
software_node = description_node.find_first('child::dataProcessing').find_first('child::software')
|
21
|
-
name = software_node.find_first('child::name').content
|
22
|
-
version = software_node.find_first('child::version').content
|
23
|
-
((name == 'Bioworks Browser') and (version == '3.3'))
|
24
|
-
rescue
|
25
|
-
false
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
# OPTIONS:
|
30
|
-
# :msrun => MSRun # use this object instead of creating one
|
31
|
-
def msrun(file, opts={})
|
32
|
-
msrun_obj =
|
33
|
-
if x = opts[:msrun]
|
34
|
-
msrun_obj = x
|
35
|
-
else
|
36
|
-
MS::MSRun.new
|
37
|
-
end
|
38
|
-
# should ensure that parsing is not counting spaces...
|
39
|
-
|
40
|
-
# a string we'd parse like this:
|
41
|
-
# doc = XML::Parser.string(st).parse
|
42
|
-
|
43
|
-
# WE NEED TO GET scan_count, start_time and end_time!!!!
|
44
|
-
id_to_scan_hash = {}
|
45
|
-
|
46
|
-
# 0 1 2 3 4 5 6
|
47
|
-
# %w(num msLevel retentionTime startMz endMz precursor spectrum)
|
48
|
-
|
49
|
-
io =
|
50
|
-
if file.is_a? String
|
51
|
-
filename = file
|
52
|
-
File.open(file)
|
53
|
-
else
|
54
|
-
file
|
55
|
-
end
|
56
|
-
root = get_root_node_from_io(io)
|
57
|
-
|
58
|
-
|
59
|
-
description = root.find_first('child::description')
|
60
|
-
bioworks33 = is_bioworks33?(description)
|
61
|
-
spectrum_list = description.next
|
62
|
-
|
63
|
-
scans = []
|
64
|
-
|
65
|
-
# bioworks 33 gives incorrect scan count
|
66
|
-
stated_num_scans = spectrum_list['count'].to_i
|
67
|
-
|
68
|
-
# if I move from node to node, it means I've checked that it's a sequence
|
69
|
-
# and that the elements are req'd
|
70
|
-
if spectrum_list.child?
|
71
|
-
spectrum_n = spectrum_list.child
|
72
|
-
loop do
|
73
|
-
scan = MS::Scan.new(9)
|
74
|
-
id = spectrum_n["id"].to_i
|
75
|
-
id_to_scan_hash[id] = scan
|
76
|
-
spec_desc_n = spectrum_n.child # required in sequence
|
77
|
-
spec_settings_n = spec_desc_n.child # required in sequence
|
78
|
-
if acq_n = spec_settings_n.find_first('descendant::acquisition')
|
79
|
-
scan[0] = acq_n['acqNumber'].to_i
|
80
|
-
else
|
81
|
-
scan[0] = id
|
82
|
-
end
|
83
|
-
spec_inst_n = spec_settings_n.find_first('child::spectrumInstrument')
|
84
|
-
scan[1] = spec_inst_n['msLevel'].to_i
|
85
|
-
|
86
|
-
# we could use a scan_count, but in bioworks 33, we can't trust the
|
87
|
-
# scan count! So, we just collect them
|
88
|
-
scans << scan
|
89
|
-
|
90
|
-
scan[3] = spec_inst_n['mzRangeStart'].to_f
|
91
|
-
scan[4] = spec_inst_n['mzRangeStop'].to_f
|
92
|
-
spec_inst_n.find('child::cvParam').each do |cv_param|
|
93
|
-
if cv_param['name'] == 'TimeInMinutes'
|
94
|
-
scan[2] = cv_param['value'].to_f * 60 #convert to seconds
|
95
|
-
end
|
96
|
-
end
|
97
|
-
if scan[1] > 1 # precursormz info
|
98
|
-
prec_list_n = spec_settings_n.next
|
99
|
-
raise RuntimeError, "MSRun objects can only accept 1 precursor" if prec_list_n['count'] != '1'
|
100
|
-
prec_n = prec_list_n.find_first('child::precursor')
|
101
|
-
# %w(mz inten parent ms_level parent charge_states)
|
102
|
-
prec = MS::Precursor.new
|
103
|
-
unless bioworks33 # bioworks33 points to the wrong scan!!!
|
104
|
-
prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
|
105
|
-
end
|
106
|
-
# we're not keeping track of this guy anymore
|
107
|
-
# prec[3] = prec_n['msLevel'].to_i
|
108
|
-
charges = []
|
109
|
-
prec_n.find('descendant::cvParam').each do |cv_param_n|
|
110
|
-
case cv_param_n['name']
|
111
|
-
when 'MassToChargeRatio'
|
112
|
-
prec[0] = cv_param_n['value'].to_f
|
113
|
-
# find the prec intensity
|
114
|
-
unless bioworks33
|
115
|
-
prec[1] = prec[2].spectrum.intensity_at_mz(prec[0])
|
116
|
-
end
|
117
|
-
when 'ChargeState'
|
118
|
-
charges << cv_param_n['value'].to_i
|
119
|
-
end
|
120
|
-
end
|
121
|
-
prec[3] = charges
|
122
|
-
scan[5] = prec
|
123
|
-
else # no precursors
|
124
|
-
scan[5] = nil
|
125
|
-
end
|
126
|
-
# here's the one line way of doing it, but it's probably more clear in
|
127
|
-
# the loop
|
128
|
-
#while ((mz_array_bin_n = spec_desc_n.next).name != 'mzArrayBinary') do
|
129
|
-
unless opts[:lazy] == :no_spectra
|
130
|
-
mz_array_bin_n = nil
|
131
|
-
loop do
|
132
|
-
mz_array_bin_n = spec_desc_n.next
|
133
|
-
break if mz_array_bin_n.name == 'mzArrayBinary'
|
134
|
-
end
|
135
|
-
mz_data_n = mz_array_bin_n.child
|
136
|
-
inten_array_bin_n = mz_array_bin_n.next
|
137
|
-
inten_data_n = inten_array_bin_n.child
|
138
|
-
case opts[:lazy]
|
139
|
-
when :string
|
140
|
-
scan[6] = MS::Spectrum::LazyString.from_base64_pair(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true) )
|
141
|
-
when :io
|
142
|
-
mz_data_n_content = mz_data_n.content
|
143
|
-
i_data_n_content = inten_data_n.content
|
144
|
-
scan[6] = MS::Spectrum::LazyIO.new(io, mz_data_n_content.first, mz_data_n_content.last, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), i_data_n_content.first, i_data_n_content.last, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
|
145
|
-
when :not
|
146
|
-
mz = MS::Spectrum.base64_to_array(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true))
|
147
|
-
inten = MS::Spectrum.base64_to_array(inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
|
148
|
-
scan[6] = MS::Spectrum.new(mz, inten)
|
149
|
-
end
|
150
|
-
end
|
151
|
-
|
152
|
-
# set up the next loop
|
153
|
-
break unless spectrum_n = spectrum_n.next
|
154
|
-
end
|
155
|
-
end
|
156
|
-
if bioworks33
|
157
|
-
MS::MSRun.add_parent_scan(scans, ((opts[:lazy] == :not) ? true : false))
|
158
|
-
end
|
159
|
-
msrun_obj.scans = scans
|
160
|
-
msrun_obj.scan_count = scans.size
|
161
|
-
unless bioworks33 # we know the scan count is off here
|
162
|
-
if msrun_obj.scan_count != stated_num_scans
|
163
|
-
warn "num collected scans (#{scans.size}) does not agree with stated num scans (#{stated_num_scans})!"
|
164
|
-
end
|
165
|
-
end
|
166
|
-
msrun_obj.start_time = msrun_obj.scans.first.time
|
167
|
-
msrun_obj.end_time = msrun_obj.scans.last.time
|
168
|
-
|
169
|
-
io.close if filename
|
170
|
-
end
|
171
|
-
|
172
|
-
end
|
173
|
-
|
174
|
-
|
175
|
-
|
data/lib/ms/parser/mzdata.rb
DELETED
@@ -1,31 +0,0 @@
|
|
1
|
-
require 'ms/msrun'
|
2
|
-
|
3
|
-
module MS; end
|
4
|
-
|
5
|
-
module MS::Parser::MzData
|
6
|
-
Base_dir_for_parsers = 'ms/parser/mzdata'
|
7
|
-
|
8
|
-
# inherits XMLStyleParser and version
|
9
|
-
include MS::Parser
|
10
|
-
include XMLStyleParser
|
11
|
-
|
12
|
-
# returns a specific parser MS::Parser::MzXML::#{ParserType}
|
13
|
-
# based on choose_parser from xml_style_parser
|
14
|
-
def self.new(parse_type=:msrun, version='1.05', opts={})
|
15
|
-
special_subclass =
|
16
|
-
if opts[:lazy] == :io
|
17
|
-
'LazyData'
|
18
|
-
else ; nil
|
19
|
-
end
|
20
|
-
|
21
|
-
@version = version
|
22
|
-
@method = parse_type
|
23
|
-
#p self.methods.grep /choose_parser/
|
24
|
-
XMLStyleParser.require_parse_files(Base_dir_for_parsers)
|
25
|
-
parser_class = XMLStyleParser.choose_parser(self, parse_type, special_subclass)
|
26
|
-
parser = parser_class.new(parse_type, version)
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
30
|
-
|
31
|
-
|
data/lib/ms/parser/mzxml/axml.rb
DELETED
@@ -1,70 +0,0 @@
|
|
1
|
-
require 'ms/parser/mzxml/dom'
|
2
|
-
|
3
|
-
class MS::Parser::MzXML::AXML < MS::Parser::MzXML::DOM
|
4
|
-
def get_root_node_from_string(string)
|
5
|
-
::AXML.parse(string)
|
6
|
-
end
|
7
|
-
def get_root_node_from_file(file)
|
8
|
-
::AXML.parse_file(file)
|
9
|
-
end
|
10
|
-
def get_root_node_from_io(io)
|
11
|
-
::AXML.parse(io)
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
class MS::Parser::MzXML::AXML::LazyPeaks < MS::Parser::MzXML::AXML
|
16
|
-
def get_root_node_from_string(string)
|
17
|
-
::AXML::LazyPeaks.parse(string)
|
18
|
-
end
|
19
|
-
def get_root_node_from_file(file)
|
20
|
-
::AXML::LazyPeaks.parse_file(file)
|
21
|
-
end
|
22
|
-
def get_root_node_from_io(io)
|
23
|
-
::AXML::LazyPeaks.parse(io)
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
class AXML::LazyPeaks < AXML
|
28
|
-
# Returns the root node (as Element) or nodes (as Array)
|
29
|
-
def self.parse(stream)
|
30
|
-
parser = ::AXML::XMLParser::LazyPeaks.new
|
31
|
-
parser.parse(stream)
|
32
|
-
parser.root
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
# This parser stores information about where the peaks information is in the
|
37
|
-
# file
|
38
|
-
# The content of the peaks node is an array where the first member is the
|
39
|
-
# start index and the last member is the number of bytes. All other members
|
40
|
-
# should be ignored.
|
41
|
-
class AXML::XMLParser::LazyPeaks < ::AXML::XMLParser
|
42
|
-
|
43
|
-
def startElement(name, attributes)
|
44
|
-
text =
|
45
|
-
if name == 'peaks' ; []
|
46
|
-
else ; ''
|
47
|
-
end
|
48
|
-
new_el = ::AXML::El.new(@cur, name, attributes, text, [])
|
49
|
-
# add the new node to the previous parent node
|
50
|
-
@cur.add_node(new_el)
|
51
|
-
# notice the change in @cur node
|
52
|
-
@cur = new_el
|
53
|
-
end
|
54
|
-
|
55
|
-
def character(data)
|
56
|
-
if @cur.text.is_a? Array
|
57
|
-
@cur.text << byteIndex
|
58
|
-
else
|
59
|
-
@cur.text << data
|
60
|
-
end
|
61
|
-
end
|
62
|
-
|
63
|
-
def endElement(name)
|
64
|
-
if @cur.text.is_a? Array
|
65
|
-
@cur.text << (byteIndex - @cur.text.first)
|
66
|
-
end
|
67
|
-
@cur = @cur.parent
|
68
|
-
end
|
69
|
-
|
70
|
-
end
|
data/lib/ms/parser/mzxml/dom.rb
DELETED
@@ -1,182 +0,0 @@
|
|
1
|
-
require 'xml_style_parser'
|
2
|
-
require 'ms/spectrum'
|
3
|
-
require 'ms/scan'
|
4
|
-
require 'ms/parser/mzxml'
|
5
|
-
require 'tempfile'
|
6
|
-
|
7
|
-
|
8
|
-
class MS::Parser::MzXML::DOM
|
9
|
-
include XMLStyleParser
|
10
|
-
include MS::Parser::MzXML
|
11
|
-
|
12
|
-
NetworkOrder = true
|
13
|
-
|
14
|
-
#@@scan_atts = %w(num msLevel retentionTime startMz endMz precursor spectrum)
|
15
|
-
|
16
|
-
def initialize(parse_type=:msrun, version='1.0')
|
17
|
-
@method = parse_type
|
18
|
-
@version = version
|
19
|
-
end
|
20
|
-
|
21
|
-
def new_scan_from_hash(node)
|
22
|
-
scan = MS::Scan.new # array class creates one with 9 positions
|
23
|
-
scan[0] = node['num'].to_i
|
24
|
-
scan[1] = node['msLevel'].to_i
|
25
|
-
if x = node['retentionTime']
|
26
|
-
scan[2] = x[2...-1].to_f
|
27
|
-
end
|
28
|
-
if x = node['startMz']
|
29
|
-
scan[3] = x.to_f
|
30
|
-
scan[4] = node['endMz'].to_f
|
31
|
-
end
|
32
|
-
scan
|
33
|
-
end
|
34
|
-
|
35
|
-
# assumes that node contains scans and checks any scan nodes for children
|
36
|
-
def add_scan_nodes(nodes, scans, scn_index, scans_by_num, lazy, io)
|
37
|
-
nodes.each do |scan_n|
|
38
|
-
scan = create_scan(scan_n, scans_by_num, lazy, io)
|
39
|
-
scans[scn_index] = scan
|
40
|
-
scans_by_num[scan[0]] = scan
|
41
|
-
scn_index += 1
|
42
|
-
if @version > '1.0'
|
43
|
-
new_nodes = scan_n.find('child::scan')
|
44
|
-
if new_nodes.size > 0
|
45
|
-
scn_index = add_scan_nodes(new_nodes, scans, scn_index, scans_by_num, lazy, io)
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
scn_index
|
50
|
-
end
|
51
|
-
|
52
|
-
# takes a scan node and creates a scan object
|
53
|
-
# the parent scan is the one directly above it in mslevel
|
54
|
-
# lazy must be a symbol from MS::MSRun.new
|
55
|
-
def create_scan(scan_n, scans_by_num, lazy, io=nil)
|
56
|
-
scan = new_scan_from_hash(scan_n)
|
57
|
-
prec = nil
|
58
|
-
scan_n.each do |node|
|
59
|
-
case node.name
|
60
|
-
when 'precursorMz'
|
61
|
-
# should be able to do this!!!
|
62
|
-
#scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
|
63
|
-
raise RuntimeError, "the msrun object can only handle one precursor!" unless prec.nil?
|
64
|
-
prec = MS::Precursor.new
|
65
|
-
prec[1] = node['precursorIntensity'].to_f
|
66
|
-
prec[0] = node.content.to_f
|
67
|
-
if x = node['precursorScanNum']
|
68
|
-
prec[2] = scans_by_num[x.to_i]
|
69
|
-
end
|
70
|
-
when 'peaks'
|
71
|
-
case lazy
|
72
|
-
when :no_spectra
|
73
|
-
next
|
74
|
-
when :string
|
75
|
-
scan[6] = MS::Spectrum::LazyString.from_base64_peaks(node.content, node['precision'].to_i)
|
76
|
-
when :io
|
77
|
-
# assumes that parsing was done with a LazyPeaks parser!
|
78
|
-
nc = node.content
|
79
|
-
scan[6] = MS::Spectrum::LazyIO.new(io, nc.first, nc.last, node['precision'].to_i, MS::Parser::MzXML::DOM::NetworkOrder)
|
80
|
-
when :not
|
81
|
-
# SHOULD be able to do this!!
|
82
|
-
#peaks_n = scan_n.find_first('child::peaks')
|
83
|
-
scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
scan[5] = prec
|
88
|
-
scan
|
89
|
-
end
|
90
|
-
|
91
|
-
|
92
|
-
# returns an array of msrun objects
|
93
|
-
def msruns(file)
|
94
|
-
raise NotImplementedError
|
95
|
-
end
|
96
|
-
|
97
|
-
# right now cannot parse multiple runs out of an mzXML version 2 file since
|
98
|
-
# this is built around a single run per file
|
99
|
-
# OPTIONS:
|
100
|
-
# :msrun => (an MSRun object) # use this object instead of creating one
|
101
|
-
# :lazy => [See MS::MSRun for documentation]
|
102
|
-
def msrun(file, opts={})
|
103
|
-
#unless opts.key?(:spectra)
|
104
|
-
# opts[:spectra] = true
|
105
|
-
#end
|
106
|
-
|
107
|
-
msrun_obj =
|
108
|
-
if x = opts[:msrun]
|
109
|
-
msrun_obj = x
|
110
|
-
else
|
111
|
-
MS::MSRun.new
|
112
|
-
end
|
113
|
-
|
114
|
-
io =
|
115
|
-
if file.is_a? String # a filename
|
116
|
-
filename = file
|
117
|
-
File.open(file)
|
118
|
-
else
|
119
|
-
file
|
120
|
-
end
|
121
|
-
|
122
|
-
root = get_root_node_from_io(io)
|
123
|
-
|
124
|
-
if filename
|
125
|
-
io.close # can close now
|
126
|
-
end
|
127
|
-
|
128
|
-
# right now we are only finding the first msRun (probably a rare case of
|
129
|
-
# multiple runs in an mzXML file...)
|
130
|
-
msrun_n =
|
131
|
-
if @version >= '2.0'
|
132
|
-
kids = root.children.select {|v| v.name == 'msRun' }
|
133
|
-
raise(NotImplementedError, "one msrun per doc right now" ) if kids.size > 1
|
134
|
-
kids.first
|
135
|
-
else
|
136
|
-
root
|
137
|
-
end
|
138
|
-
if msrun_n.name != 'msRun'
|
139
|
-
raise RuntimeError, "extra node slipped in somehow"
|
140
|
-
end
|
141
|
-
|
142
|
-
## HEADER
|
143
|
-
scan_count = msrun_n['scanCount'].to_i
|
144
|
-
msrun_obj.scan_count = scan_count
|
145
|
-
scans_by_num = Array.new(scan_count + 1)
|
146
|
-
|
147
|
-
## SPECTRUM
|
148
|
-
parent = nil
|
149
|
-
scans = Array.new( scan_count )
|
150
|
-
scn_index = 0
|
151
|
-
|
152
|
-
# we should be able to do this, but it's not working!!!
|
153
|
-
#scan_n = msrun_n.find_first('scan')
|
154
|
-
#while (scn_index < scan_count)
|
155
|
-
lazy = opts[:lazy]
|
156
|
-
|
157
|
-
if @version >= '3.0'
|
158
|
-
warn '[version 3.0 parsing may fail if > 1 peak list per scan]'
|
159
|
-
# note that mzXML version 3.0 *can* have more than one peak...
|
160
|
-
# I'm not sure how to deal with that since I have one spectrum/scan
|
161
|
-
end
|
162
|
-
|
163
|
-
scan_nodes = msrun_n.find('child::scan')
|
164
|
-
add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, lazy, io)
|
165
|
-
|
166
|
-
## update the scan's parents
|
167
|
-
MS::MSRun.add_parent_scan(scans)
|
168
|
-
|
169
|
-
# note that startTime and endTime are optional AND in >2.2 are dateTime
|
170
|
-
# instead of duration types!, so we will just use scan times...
|
171
|
-
# Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
|
172
|
-
# export. They give the start and end time in seconds, but they are
|
173
|
-
# really minutes. All the more reason to use the first and last scans!
|
174
|
-
msrun_obj.start_time = scans.first.time
|
175
|
-
msrun_obj.end_time = scans.last.time
|
176
|
-
|
177
|
-
msrun_obj.scans = scans
|
178
|
-
|
179
|
-
end
|
180
|
-
end
|
181
|
-
|
182
|
-
|