mspire 0.4.9 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/ms/msrun_index.rb
DELETED
|
@@ -1,108 +0,0 @@
|
|
|
1
|
-
require 'ms/scan'
|
|
2
|
-
require 'ms/parser'
|
|
3
|
-
|
|
4
|
-
class MS::MSRunIndex
|
|
5
|
-
# basename_noext is the base name of the file (with NO extensions)
|
|
6
|
-
attr_accessor :scans_by_num
|
|
7
|
-
attr_reader :basename_noext
|
|
8
|
-
|
|
9
|
-
# identifies and removes .mzXML .mzXML.timeIndex and .timeIndex
|
|
10
|
-
# otherwise, removes one extension and that's the filename_noext
|
|
11
|
-
# Also, removes any leading path
|
|
12
|
-
def basename_noext=(filename)
|
|
13
|
-
ext = File.extname(filename)
|
|
14
|
-
basename = File.basename(filename)
|
|
15
|
-
case ext
|
|
16
|
-
when '.mzXML'
|
|
17
|
-
@basename_noext = basename.gsub(/\.mzXML$/, "")
|
|
18
|
-
when '.timeIndex'
|
|
19
|
-
@basename_noext = basename.gsub(/\.timeIndex$/, "")
|
|
20
|
-
if File.extname(@basename_noext) == ".mzXML"
|
|
21
|
-
@basename_noext.gsub!(/\.mzXML$/, "")
|
|
22
|
-
end
|
|
23
|
-
else
|
|
24
|
-
@basename_noext = basename.gsub(/#{Regexp.escape(ext)}/, "")
|
|
25
|
-
end
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
# index_file has one row for each scan:
|
|
29
|
-
# ms_level scan_num time [prec_mz prec_inten]
|
|
30
|
-
# also consider getting this data directly from the mzXML file
|
|
31
|
-
# via the MS::MzXML::Parser.get_msrun_index command
|
|
32
|
-
def set_from_index_file(index_file)
|
|
33
|
-
self.basename_noext = index_file
|
|
34
|
-
@scans_by_num = []
|
|
35
|
-
if index_file
|
|
36
|
-
File.open(index_file).each do |line|
|
|
37
|
-
next if line !~ /\d/ || line =~ /^#/
|
|
38
|
-
line.chomp!
|
|
39
|
-
arr = line.split(" ")
|
|
40
|
-
scan = MS::Scan.new(arr[1].to_i, arr[0].to_i, arr[2].to_f)
|
|
41
|
-
if scan.ms_level > 1
|
|
42
|
-
scan.prec_mz = arr[3].to_f
|
|
43
|
-
scan.prec_inten = arr[4].to_f
|
|
44
|
-
end
|
|
45
|
-
@scans_by_num[scan.num] = scan
|
|
46
|
-
end
|
|
47
|
-
end
|
|
48
|
-
MS::Scan.add_parent_scan(@scans_by_num)
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
# Takes a .mzXML file or .timeIndex file (currently)
|
|
52
|
-
# and creates an index of scans from it
|
|
53
|
-
def initialize(file=nil)
|
|
54
|
-
@scans_by_num = []
|
|
55
|
-
if file
|
|
56
|
-
ext = File.extname(file)
|
|
57
|
-
case ext
|
|
58
|
-
when '.mzXML'
|
|
59
|
-
set_from_mzxml(file)
|
|
60
|
-
when '.timeIndex'
|
|
61
|
-
set_from_index_file(file)
|
|
62
|
-
else
|
|
63
|
-
raise ArgumentError, "#{self.class}.new doesn't recognize files of extension: #{ext}"
|
|
64
|
-
end
|
|
65
|
-
end
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
# returns a new
|
|
70
|
-
def set_from_mzxml(file)
|
|
71
|
-
self.basename_noext = file
|
|
72
|
-
@scans_by_num = MS::Parser.new(file, :scans_by_num).parse(file)
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
# writes the index to filename
|
|
76
|
-
# each line:
|
|
77
|
-
# ms_level scan_num time (if !ms_level=1) { prec_mz prec_intensity)
|
|
78
|
-
def to_index_file(filename)
|
|
79
|
-
strings = []
|
|
80
|
-
@scans_by_num.each do |scan|
|
|
81
|
-
if scan
|
|
82
|
-
strings << scan.to_index_file_string
|
|
83
|
-
end
|
|
84
|
-
end
|
|
85
|
-
File.open(filename, "w") do |fh|
|
|
86
|
-
fh.print strings.join("\n")
|
|
87
|
-
end
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
# returns an array of the times of the precursor scan's parent (not its own
|
|
91
|
-
# acquisition time). The parent scan index will also retrieve the time of
|
|
92
|
-
# the parent scan.
|
|
93
|
-
def parent_times_by_scan_num
|
|
94
|
-
by_num = []
|
|
95
|
-
parent_time = nil
|
|
96
|
-
@scans_by_num.each_with_index do |scan,i|
|
|
97
|
-
if scan.ms_level == 1
|
|
98
|
-
parent_time = scan.time
|
|
99
|
-
end
|
|
100
|
-
by_num[i] = parent_time
|
|
101
|
-
end
|
|
102
|
-
by_num
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
end
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
require 'ms/parser/mzdata/dom'
|
|
2
|
-
|
|
3
|
-
class MS::Parser::MzData::AXML < MS::Parser::MzData::DOM
|
|
4
|
-
def get_root_node_from_file(file)
|
|
5
|
-
::AXML.parse_file(file)
|
|
6
|
-
end
|
|
7
|
-
def get_root_node_from_io(io)
|
|
8
|
-
::AXML.parse(io)
|
|
9
|
-
end
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
class MS::Parser::MzData::AXML::LazyData < MS::Parser::MzData::AXML
|
|
13
|
-
def get_root_node_from_string(string)
|
|
14
|
-
::AXML::LazyData.parse(string)
|
|
15
|
-
end
|
|
16
|
-
def get_root_node_from_file(file)
|
|
17
|
-
::AXML::LazyData.parse_file(file)
|
|
18
|
-
end
|
|
19
|
-
def get_root_node_from_io(io)
|
|
20
|
-
::AXML::LazyData.parse(io)
|
|
21
|
-
end
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
class AXML::LazyData < AXML
|
|
25
|
-
# Returns the root node (as Element) or nodes (as Array)
|
|
26
|
-
def self.parse(stream)
|
|
27
|
-
parser = ::AXML::XMLParser::LazyData.new
|
|
28
|
-
parser.parse(stream)
|
|
29
|
-
parser.root
|
|
30
|
-
end
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
# This parser stores information about where the data (peaks) information is
|
|
34
|
-
# in the file
|
|
35
|
-
# The content of the data node is an array where the first member is the
|
|
36
|
-
# start index and the last member is the number of bytes. All other members
|
|
37
|
-
# should be ignored.
|
|
38
|
-
class AXML::XMLParser::LazyData < ::AXML::XMLParser
|
|
39
|
-
|
|
40
|
-
def startElement(name, attributes)
|
|
41
|
-
text =
|
|
42
|
-
if name == 'data' ; []
|
|
43
|
-
else ; ''
|
|
44
|
-
end
|
|
45
|
-
new_el = ::AXML::El.new(@cur, name, attributes, text, [])
|
|
46
|
-
# add the new node to the previous parent node
|
|
47
|
-
@cur.add_node(new_el)
|
|
48
|
-
# notice the change in @cur node
|
|
49
|
-
@cur = new_el
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
def character(data)
|
|
53
|
-
if @cur.text.is_a? Array
|
|
54
|
-
@cur.text << byteIndex
|
|
55
|
-
else
|
|
56
|
-
@cur.text << data
|
|
57
|
-
end
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
def endElement(name)
|
|
61
|
-
if @cur.text.is_a? Array
|
|
62
|
-
@cur.text << (byteIndex - @cur.text.first)
|
|
63
|
-
end
|
|
64
|
-
@cur = @cur.parent
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
end
|
data/lib/ms/parser/mzdata/dom.rb
DELETED
|
@@ -1,175 +0,0 @@
|
|
|
1
|
-
require 'xml_style_parser'
|
|
2
|
-
require 'ms/spectrum'
|
|
3
|
-
require 'ms/scan'
|
|
4
|
-
|
|
5
|
-
module MS::Parser::MzData ; end
|
|
6
|
-
|
|
7
|
-
class MS::Parser::MzData::DOM
|
|
8
|
-
include XMLStyleParser
|
|
9
|
-
include MS::Parser::MzData
|
|
10
|
-
|
|
11
|
-
def initialize(parse_type=:msrun, version='1.0')
|
|
12
|
-
@method = parse_type
|
|
13
|
-
@version = version
|
|
14
|
-
end
|
|
15
|
-
|
|
16
|
-
# true if there is a node <dataProcessing><software><name>Bioworks Browser</...>
|
|
17
|
-
# otherwise false
|
|
18
|
-
def is_bioworks33?(description_node)
|
|
19
|
-
begin
|
|
20
|
-
software_node = description_node.find_first('child::dataProcessing').find_first('child::software')
|
|
21
|
-
name = software_node.find_first('child::name').content
|
|
22
|
-
version = software_node.find_first('child::version').content
|
|
23
|
-
((name == 'Bioworks Browser') and (version == '3.3'))
|
|
24
|
-
rescue
|
|
25
|
-
false
|
|
26
|
-
end
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
# OPTIONS:
|
|
30
|
-
# :msrun => MSRun # use this object instead of creating one
|
|
31
|
-
def msrun(file, opts={})
|
|
32
|
-
msrun_obj =
|
|
33
|
-
if x = opts[:msrun]
|
|
34
|
-
msrun_obj = x
|
|
35
|
-
else
|
|
36
|
-
MS::MSRun.new
|
|
37
|
-
end
|
|
38
|
-
# should ensure that parsing is not counting spaces...
|
|
39
|
-
|
|
40
|
-
# a string we'd parse like this:
|
|
41
|
-
# doc = XML::Parser.string(st).parse
|
|
42
|
-
|
|
43
|
-
# WE NEED TO GET scan_count, start_time and end_time!!!!
|
|
44
|
-
id_to_scan_hash = {}
|
|
45
|
-
|
|
46
|
-
# 0 1 2 3 4 5 6
|
|
47
|
-
# %w(num msLevel retentionTime startMz endMz precursor spectrum)
|
|
48
|
-
|
|
49
|
-
io =
|
|
50
|
-
if file.is_a? String
|
|
51
|
-
filename = file
|
|
52
|
-
File.open(file)
|
|
53
|
-
else
|
|
54
|
-
file
|
|
55
|
-
end
|
|
56
|
-
root = get_root_node_from_io(io)
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
description = root.find_first('child::description')
|
|
60
|
-
bioworks33 = is_bioworks33?(description)
|
|
61
|
-
spectrum_list = description.next
|
|
62
|
-
|
|
63
|
-
scans = []
|
|
64
|
-
|
|
65
|
-
# bioworks 33 gives incorrect scan count
|
|
66
|
-
stated_num_scans = spectrum_list['count'].to_i
|
|
67
|
-
|
|
68
|
-
# if I move from node to node, it means I've checked that it's a sequence
|
|
69
|
-
# and that the elements are req'd
|
|
70
|
-
if spectrum_list.child?
|
|
71
|
-
spectrum_n = spectrum_list.child
|
|
72
|
-
loop do
|
|
73
|
-
scan = MS::Scan.new(9)
|
|
74
|
-
id = spectrum_n["id"].to_i
|
|
75
|
-
id_to_scan_hash[id] = scan
|
|
76
|
-
spec_desc_n = spectrum_n.child # required in sequence
|
|
77
|
-
spec_settings_n = spec_desc_n.child # required in sequence
|
|
78
|
-
if acq_n = spec_settings_n.find_first('descendant::acquisition')
|
|
79
|
-
scan[0] = acq_n['acqNumber'].to_i
|
|
80
|
-
else
|
|
81
|
-
scan[0] = id
|
|
82
|
-
end
|
|
83
|
-
spec_inst_n = spec_settings_n.find_first('child::spectrumInstrument')
|
|
84
|
-
scan[1] = spec_inst_n['msLevel'].to_i
|
|
85
|
-
|
|
86
|
-
# we could use a scan_count, but in bioworks 33, we can't trust the
|
|
87
|
-
# scan count! So, we just collect them
|
|
88
|
-
scans << scan
|
|
89
|
-
|
|
90
|
-
scan[3] = spec_inst_n['mzRangeStart'].to_f
|
|
91
|
-
scan[4] = spec_inst_n['mzRangeStop'].to_f
|
|
92
|
-
spec_inst_n.find('child::cvParam').each do |cv_param|
|
|
93
|
-
if cv_param['name'] == 'TimeInMinutes'
|
|
94
|
-
scan[2] = cv_param['value'].to_f * 60 #convert to seconds
|
|
95
|
-
end
|
|
96
|
-
end
|
|
97
|
-
if scan[1] > 1 # precursormz info
|
|
98
|
-
prec_list_n = spec_settings_n.next
|
|
99
|
-
raise RuntimeError, "MSRun objects can only accept 1 precursor" if prec_list_n['count'] != '1'
|
|
100
|
-
prec_n = prec_list_n.find_first('child::precursor')
|
|
101
|
-
# %w(mz inten parent ms_level parent charge_states)
|
|
102
|
-
prec = MS::Precursor.new
|
|
103
|
-
unless bioworks33 # bioworks33 points to the wrong scan!!!
|
|
104
|
-
prec[2] = id_to_scan_hash[prec_n['spectrumRef'].to_i]
|
|
105
|
-
end
|
|
106
|
-
# we're not keeping track of this guy anymore
|
|
107
|
-
# prec[3] = prec_n['msLevel'].to_i
|
|
108
|
-
charges = []
|
|
109
|
-
prec_n.find('descendant::cvParam').each do |cv_param_n|
|
|
110
|
-
case cv_param_n['name']
|
|
111
|
-
when 'MassToChargeRatio'
|
|
112
|
-
prec[0] = cv_param_n['value'].to_f
|
|
113
|
-
# find the prec intensity
|
|
114
|
-
unless bioworks33
|
|
115
|
-
prec[1] = prec[2].spectrum.intensity_at_mz(prec[0])
|
|
116
|
-
end
|
|
117
|
-
when 'ChargeState'
|
|
118
|
-
charges << cv_param_n['value'].to_i
|
|
119
|
-
end
|
|
120
|
-
end
|
|
121
|
-
prec[3] = charges
|
|
122
|
-
scan[5] = prec
|
|
123
|
-
else # no precursors
|
|
124
|
-
scan[5] = nil
|
|
125
|
-
end
|
|
126
|
-
# here's the one line way of doing it, but it's probably more clear in
|
|
127
|
-
# the loop
|
|
128
|
-
#while ((mz_array_bin_n = spec_desc_n.next).name != 'mzArrayBinary') do
|
|
129
|
-
unless opts[:lazy] == :no_spectra
|
|
130
|
-
mz_array_bin_n = nil
|
|
131
|
-
loop do
|
|
132
|
-
mz_array_bin_n = spec_desc_n.next
|
|
133
|
-
break if mz_array_bin_n.name == 'mzArrayBinary'
|
|
134
|
-
end
|
|
135
|
-
mz_data_n = mz_array_bin_n.child
|
|
136
|
-
inten_array_bin_n = mz_array_bin_n.next
|
|
137
|
-
inten_data_n = inten_array_bin_n.child
|
|
138
|
-
case opts[:lazy]
|
|
139
|
-
when :string
|
|
140
|
-
scan[6] = MS::Spectrum::LazyString.from_base64_pair(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true) )
|
|
141
|
-
when :io
|
|
142
|
-
mz_data_n_content = mz_data_n.content
|
|
143
|
-
i_data_n_content = inten_data_n.content
|
|
144
|
-
scan[6] = MS::Spectrum::LazyIO.new(io, mz_data_n_content.first, mz_data_n_content.last, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true), i_data_n_content.first, i_data_n_content.last, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
|
|
145
|
-
when :not
|
|
146
|
-
mz = MS::Spectrum.base64_to_array(mz_data_n.content, mz_data_n['precision'].to_i, ((mz_data_n['endian']=='little') ? false : true))
|
|
147
|
-
inten = MS::Spectrum.base64_to_array(inten_data_n.content, inten_data_n['precision'].to_i, ((inten_data_n['endian']=='little') ? false : true))
|
|
148
|
-
scan[6] = MS::Spectrum.new(mz, inten)
|
|
149
|
-
end
|
|
150
|
-
end
|
|
151
|
-
|
|
152
|
-
# set up the next loop
|
|
153
|
-
break unless spectrum_n = spectrum_n.next
|
|
154
|
-
end
|
|
155
|
-
end
|
|
156
|
-
if bioworks33
|
|
157
|
-
MS::MSRun.add_parent_scan(scans, ((opts[:lazy] == :not) ? true : false))
|
|
158
|
-
end
|
|
159
|
-
msrun_obj.scans = scans
|
|
160
|
-
msrun_obj.scan_count = scans.size
|
|
161
|
-
unless bioworks33 # we know the scan count is off here
|
|
162
|
-
if msrun_obj.scan_count != stated_num_scans
|
|
163
|
-
warn "num collected scans (#{scans.size}) does not agree with stated num scans (#{stated_num_scans})!"
|
|
164
|
-
end
|
|
165
|
-
end
|
|
166
|
-
msrun_obj.start_time = msrun_obj.scans.first.time
|
|
167
|
-
msrun_obj.end_time = msrun_obj.scans.last.time
|
|
168
|
-
|
|
169
|
-
io.close if filename
|
|
170
|
-
end
|
|
171
|
-
|
|
172
|
-
end
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
data/lib/ms/parser/mzdata.rb
DELETED
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
require 'ms/msrun'
|
|
2
|
-
|
|
3
|
-
module MS; end
|
|
4
|
-
|
|
5
|
-
module MS::Parser::MzData
|
|
6
|
-
Base_dir_for_parsers = 'ms/parser/mzdata'
|
|
7
|
-
|
|
8
|
-
# inherits XMLStyleParser and version
|
|
9
|
-
include MS::Parser
|
|
10
|
-
include XMLStyleParser
|
|
11
|
-
|
|
12
|
-
# returns a specific parser MS::Parser::MzXML::#{ParserType}
|
|
13
|
-
# based on choose_parser from xml_style_parser
|
|
14
|
-
def self.new(parse_type=:msrun, version='1.05', opts={})
|
|
15
|
-
special_subclass =
|
|
16
|
-
if opts[:lazy] == :io
|
|
17
|
-
'LazyData'
|
|
18
|
-
else ; nil
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
@version = version
|
|
22
|
-
@method = parse_type
|
|
23
|
-
#p self.methods.grep /choose_parser/
|
|
24
|
-
XMLStyleParser.require_parse_files(Base_dir_for_parsers)
|
|
25
|
-
parser_class = XMLStyleParser.choose_parser(self, parse_type, special_subclass)
|
|
26
|
-
parser = parser_class.new(parse_type, version)
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
|
data/lib/ms/parser/mzxml/axml.rb
DELETED
|
@@ -1,70 +0,0 @@
|
|
|
1
|
-
require 'ms/parser/mzxml/dom'
|
|
2
|
-
|
|
3
|
-
class MS::Parser::MzXML::AXML < MS::Parser::MzXML::DOM
|
|
4
|
-
def get_root_node_from_string(string)
|
|
5
|
-
::AXML.parse(string)
|
|
6
|
-
end
|
|
7
|
-
def get_root_node_from_file(file)
|
|
8
|
-
::AXML.parse_file(file)
|
|
9
|
-
end
|
|
10
|
-
def get_root_node_from_io(io)
|
|
11
|
-
::AXML.parse(io)
|
|
12
|
-
end
|
|
13
|
-
end
|
|
14
|
-
|
|
15
|
-
class MS::Parser::MzXML::AXML::LazyPeaks < MS::Parser::MzXML::AXML
|
|
16
|
-
def get_root_node_from_string(string)
|
|
17
|
-
::AXML::LazyPeaks.parse(string)
|
|
18
|
-
end
|
|
19
|
-
def get_root_node_from_file(file)
|
|
20
|
-
::AXML::LazyPeaks.parse_file(file)
|
|
21
|
-
end
|
|
22
|
-
def get_root_node_from_io(io)
|
|
23
|
-
::AXML::LazyPeaks.parse(io)
|
|
24
|
-
end
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
class AXML::LazyPeaks < AXML
|
|
28
|
-
# Returns the root node (as Element) or nodes (as Array)
|
|
29
|
-
def self.parse(stream)
|
|
30
|
-
parser = ::AXML::XMLParser::LazyPeaks.new
|
|
31
|
-
parser.parse(stream)
|
|
32
|
-
parser.root
|
|
33
|
-
end
|
|
34
|
-
end
|
|
35
|
-
|
|
36
|
-
# This parser stores information about where the peaks information is in the
|
|
37
|
-
# file
|
|
38
|
-
# The content of the peaks node is an array where the first member is the
|
|
39
|
-
# start index and the last member is the number of bytes. All other members
|
|
40
|
-
# should be ignored.
|
|
41
|
-
class AXML::XMLParser::LazyPeaks < ::AXML::XMLParser
|
|
42
|
-
|
|
43
|
-
def startElement(name, attributes)
|
|
44
|
-
text =
|
|
45
|
-
if name == 'peaks' ; []
|
|
46
|
-
else ; ''
|
|
47
|
-
end
|
|
48
|
-
new_el = ::AXML::El.new(@cur, name, attributes, text, [])
|
|
49
|
-
# add the new node to the previous parent node
|
|
50
|
-
@cur.add_node(new_el)
|
|
51
|
-
# notice the change in @cur node
|
|
52
|
-
@cur = new_el
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
def character(data)
|
|
56
|
-
if @cur.text.is_a? Array
|
|
57
|
-
@cur.text << byteIndex
|
|
58
|
-
else
|
|
59
|
-
@cur.text << data
|
|
60
|
-
end
|
|
61
|
-
end
|
|
62
|
-
|
|
63
|
-
def endElement(name)
|
|
64
|
-
if @cur.text.is_a? Array
|
|
65
|
-
@cur.text << (byteIndex - @cur.text.first)
|
|
66
|
-
end
|
|
67
|
-
@cur = @cur.parent
|
|
68
|
-
end
|
|
69
|
-
|
|
70
|
-
end
|
data/lib/ms/parser/mzxml/dom.rb
DELETED
|
@@ -1,182 +0,0 @@
|
|
|
1
|
-
require 'xml_style_parser'
|
|
2
|
-
require 'ms/spectrum'
|
|
3
|
-
require 'ms/scan'
|
|
4
|
-
require 'ms/parser/mzxml'
|
|
5
|
-
require 'tempfile'
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class MS::Parser::MzXML::DOM
|
|
9
|
-
include XMLStyleParser
|
|
10
|
-
include MS::Parser::MzXML
|
|
11
|
-
|
|
12
|
-
NetworkOrder = true
|
|
13
|
-
|
|
14
|
-
#@@scan_atts = %w(num msLevel retentionTime startMz endMz precursor spectrum)
|
|
15
|
-
|
|
16
|
-
def initialize(parse_type=:msrun, version='1.0')
|
|
17
|
-
@method = parse_type
|
|
18
|
-
@version = version
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
def new_scan_from_hash(node)
|
|
22
|
-
scan = MS::Scan.new # array class creates one with 9 positions
|
|
23
|
-
scan[0] = node['num'].to_i
|
|
24
|
-
scan[1] = node['msLevel'].to_i
|
|
25
|
-
if x = node['retentionTime']
|
|
26
|
-
scan[2] = x[2...-1].to_f
|
|
27
|
-
end
|
|
28
|
-
if x = node['startMz']
|
|
29
|
-
scan[3] = x.to_f
|
|
30
|
-
scan[4] = node['endMz'].to_f
|
|
31
|
-
end
|
|
32
|
-
scan
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
# assumes that node contains scans and checks any scan nodes for children
|
|
36
|
-
def add_scan_nodes(nodes, scans, scn_index, scans_by_num, lazy, io)
|
|
37
|
-
nodes.each do |scan_n|
|
|
38
|
-
scan = create_scan(scan_n, scans_by_num, lazy, io)
|
|
39
|
-
scans[scn_index] = scan
|
|
40
|
-
scans_by_num[scan[0]] = scan
|
|
41
|
-
scn_index += 1
|
|
42
|
-
if @version > '1.0'
|
|
43
|
-
new_nodes = scan_n.find('child::scan')
|
|
44
|
-
if new_nodes.size > 0
|
|
45
|
-
scn_index = add_scan_nodes(new_nodes, scans, scn_index, scans_by_num, lazy, io)
|
|
46
|
-
end
|
|
47
|
-
end
|
|
48
|
-
end
|
|
49
|
-
scn_index
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
# takes a scan node and creates a scan object
|
|
53
|
-
# the parent scan is the one directly above it in mslevel
|
|
54
|
-
# lazy must be a symbol from MS::MSRun.new
|
|
55
|
-
def create_scan(scan_n, scans_by_num, lazy, io=nil)
|
|
56
|
-
scan = new_scan_from_hash(scan_n)
|
|
57
|
-
prec = nil
|
|
58
|
-
scan_n.each do |node|
|
|
59
|
-
case node.name
|
|
60
|
-
when 'precursorMz'
|
|
61
|
-
# should be able to do this!!!
|
|
62
|
-
#scan[5] = scan_n.find('child::precursorMz').map do |prec_n|
|
|
63
|
-
raise RuntimeError, "the msrun object can only handle one precursor!" unless prec.nil?
|
|
64
|
-
prec = MS::Precursor.new
|
|
65
|
-
prec[1] = node['precursorIntensity'].to_f
|
|
66
|
-
prec[0] = node.content.to_f
|
|
67
|
-
if x = node['precursorScanNum']
|
|
68
|
-
prec[2] = scans_by_num[x.to_i]
|
|
69
|
-
end
|
|
70
|
-
when 'peaks'
|
|
71
|
-
case lazy
|
|
72
|
-
when :no_spectra
|
|
73
|
-
next
|
|
74
|
-
when :string
|
|
75
|
-
scan[6] = MS::Spectrum::LazyString.from_base64_peaks(node.content, node['precision'].to_i)
|
|
76
|
-
when :io
|
|
77
|
-
# assumes that parsing was done with a LazyPeaks parser!
|
|
78
|
-
nc = node.content
|
|
79
|
-
scan[6] = MS::Spectrum::LazyIO.new(io, nc.first, nc.last, node['precision'].to_i, MS::Parser::MzXML::DOM::NetworkOrder)
|
|
80
|
-
when :not
|
|
81
|
-
# SHOULD be able to do this!!
|
|
82
|
-
#peaks_n = scan_n.find_first('child::peaks')
|
|
83
|
-
scan[6] = MS::Spectrum.from_base64_peaks(node.content, node['precision'].to_i)
|
|
84
|
-
end
|
|
85
|
-
end
|
|
86
|
-
end
|
|
87
|
-
scan[5] = prec
|
|
88
|
-
scan
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
# returns an array of msrun objects
|
|
93
|
-
def msruns(file)
|
|
94
|
-
raise NotImplementedError
|
|
95
|
-
end
|
|
96
|
-
|
|
97
|
-
# right now cannot parse multiple runs out of an mzXML version 2 file since
|
|
98
|
-
# this is built around a single run per file
|
|
99
|
-
# OPTIONS:
|
|
100
|
-
# :msrun => (an MSRun object) # use this object instead of creating one
|
|
101
|
-
# :lazy => [See MS::MSRun for documentation]
|
|
102
|
-
def msrun(file, opts={})
|
|
103
|
-
#unless opts.key?(:spectra)
|
|
104
|
-
# opts[:spectra] = true
|
|
105
|
-
#end
|
|
106
|
-
|
|
107
|
-
msrun_obj =
|
|
108
|
-
if x = opts[:msrun]
|
|
109
|
-
msrun_obj = x
|
|
110
|
-
else
|
|
111
|
-
MS::MSRun.new
|
|
112
|
-
end
|
|
113
|
-
|
|
114
|
-
io =
|
|
115
|
-
if file.is_a? String # a filename
|
|
116
|
-
filename = file
|
|
117
|
-
File.open(file)
|
|
118
|
-
else
|
|
119
|
-
file
|
|
120
|
-
end
|
|
121
|
-
|
|
122
|
-
root = get_root_node_from_io(io)
|
|
123
|
-
|
|
124
|
-
if filename
|
|
125
|
-
io.close # can close now
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
# right now we are only finding the first msRun (probably a rare case of
|
|
129
|
-
# multiple runs in an mzXML file...)
|
|
130
|
-
msrun_n =
|
|
131
|
-
if @version >= '2.0'
|
|
132
|
-
kids = root.children.select {|v| v.name == 'msRun' }
|
|
133
|
-
raise(NotImplementedError, "one msrun per doc right now" ) if kids.size > 1
|
|
134
|
-
kids.first
|
|
135
|
-
else
|
|
136
|
-
root
|
|
137
|
-
end
|
|
138
|
-
if msrun_n.name != 'msRun'
|
|
139
|
-
raise RuntimeError, "extra node slipped in somehow"
|
|
140
|
-
end
|
|
141
|
-
|
|
142
|
-
## HEADER
|
|
143
|
-
scan_count = msrun_n['scanCount'].to_i
|
|
144
|
-
msrun_obj.scan_count = scan_count
|
|
145
|
-
scans_by_num = Array.new(scan_count + 1)
|
|
146
|
-
|
|
147
|
-
## SPECTRUM
|
|
148
|
-
parent = nil
|
|
149
|
-
scans = Array.new( scan_count )
|
|
150
|
-
scn_index = 0
|
|
151
|
-
|
|
152
|
-
# we should be able to do this, but it's not working!!!
|
|
153
|
-
#scan_n = msrun_n.find_first('scan')
|
|
154
|
-
#while (scn_index < scan_count)
|
|
155
|
-
lazy = opts[:lazy]
|
|
156
|
-
|
|
157
|
-
if @version >= '3.0'
|
|
158
|
-
warn '[version 3.0 parsing may fail if > 1 peak list per scan]'
|
|
159
|
-
# note that mzXML version 3.0 *can* have more than one peak...
|
|
160
|
-
# I'm not sure how to deal with that since I have one spectrum/scan
|
|
161
|
-
end
|
|
162
|
-
|
|
163
|
-
scan_nodes = msrun_n.find('child::scan')
|
|
164
|
-
add_scan_nodes(scan_nodes, scans, scn_index, scans_by_num, lazy, io)
|
|
165
|
-
|
|
166
|
-
## update the scan's parents
|
|
167
|
-
MS::MSRun.add_parent_scan(scans)
|
|
168
|
-
|
|
169
|
-
# note that startTime and endTime are optional AND in >2.2 are dateTime
|
|
170
|
-
# instead of duration types!, so we will just use scan times...
|
|
171
|
-
# Also, note that startTime and endTime are BROKEN on readw -> mzXML 2.0
|
|
172
|
-
# export. They give the start and end time in seconds, but they are
|
|
173
|
-
# really minutes. All the more reason to use the first and last scans!
|
|
174
|
-
msrun_obj.start_time = scans.first.time
|
|
175
|
-
msrun_obj.end_time = scans.last.time
|
|
176
|
-
|
|
177
|
-
msrun_obj.scans = scans
|
|
178
|
-
|
|
179
|
-
end
|
|
180
|
-
end
|
|
181
|
-
|
|
182
|
-
|