RubyGems - mspire - Versions diffs - 0.4.9 → 0.5.0 - Mend

mspire 0.4.9 → 0.5.0

Files changed (255) hide show

data/README +27 -17
data/changelog.txt +31 -62
data/lib/ms/calc.rb +32 -0
data/lib/ms/data/interleaved.rb +60 -0
data/lib/ms/data/lazy_io.rb +73 -0
data/lib/ms/data/lazy_string.rb +15 -0
data/lib/ms/data/simple.rb +59 -0
data/lib/ms/data/transposed.rb +41 -0
data/lib/ms/data.rb +57 -0
data/lib/ms/format/format_error.rb +12 -0
data/lib/ms/spectrum.rb +25 -384
data/lib/ms/support/binary_search.rb +126 -0
data/lib/ms.rb +10 -10
metadata +38 -350
data/INSTALL +0 -58
data/README.rdoc +0 -18
data/Rakefile +0 -330
data/bin/aafreqs.rb +0 -23
data/bin/bioworks2excel.rb +0 -14
data/bin/bioworks_to_pepxml.rb +0 -148
data/bin/bioworks_to_pepxml_gui.rb +0 -225
data/bin/fasta_shaker.rb +0 -5
data/bin/filter_and_validate.rb +0 -5
data/bin/gi2annot.rb +0 -14
data/bin/id_class_anal.rb +0 -112
data/bin/id_precision.rb +0 -172
data/bin/ms_to_lmat.rb +0 -67
data/bin/pepproph_filter.rb +0 -16
data/bin/prob_validate.rb +0 -6
data/bin/protein_summary.rb +0 -6
data/bin/protxml2prots_peps.rb +0 -32
data/bin/raw_to_mzXML.rb +0 -55
data/bin/run_percolator.rb +0 -122
data/bin/sqt_group.rb +0 -26
data/bin/srf_group.rb +0 -27
data/bin/srf_to_sqt.rb +0 -40
data/lib/align/chams.rb +0 -78
data/lib/align.rb +0 -154
data/lib/archive/targz.rb +0 -94
data/lib/bsearch.rb +0 -120
data/lib/core_extensions.rb +0 -16
data/lib/fasta.rb +0 -626
data/lib/gi.rb +0 -124
data/lib/group_by.rb +0 -10
data/lib/index_by.rb +0 -11
data/lib/merge_deep.rb +0 -21
data/lib/ms/converter/mzxml.rb +0 -77
data/lib/ms/gradient_program.rb +0 -170
data/lib/ms/msrun.rb +0 -244
data/lib/ms/msrun_index.rb +0 -108
data/lib/ms/parser/mzdata/axml.rb +0 -67
data/lib/ms/parser/mzdata/dom.rb +0 -175
data/lib/ms/parser/mzdata/libxml.rb +0 -7
data/lib/ms/parser/mzdata.rb +0 -31
data/lib/ms/parser/mzxml/axml.rb +0 -70
data/lib/ms/parser/mzxml/dom.rb +0 -182
data/lib/ms/parser/mzxml/hpricot.rb +0 -253
data/lib/ms/parser/mzxml/libxml.rb +0 -19
data/lib/ms/parser/mzxml/regexp.rb +0 -122
data/lib/ms/parser/mzxml/rexml.rb +0 -72
data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
data/lib/ms/parser/mzxml.rb +0 -282
data/lib/ms/parser.rb +0 -108
data/lib/ms/precursor.rb +0 -25
data/lib/ms/scan.rb +0 -81
data/lib/mspire.rb +0 -4
data/lib/pi_zero.rb +0 -244
data/lib/qvalue.rb +0 -161
data/lib/roc.rb +0 -187
data/lib/sample_enzyme.rb +0 -160
data/lib/scan_i.rb +0 -21
data/lib/spec_id/aa_freqs.rb +0 -170
data/lib/spec_id/bioworks.rb +0 -497
data/lib/spec_id/digestor.rb +0 -138
data/lib/spec_id/mass.rb +0 -179
data/lib/spec_id/parser/proph.rb +0 -335
data/lib/spec_id/precision/filter/cmdline.rb +0 -218
data/lib/spec_id/precision/filter/interactive.rb +0 -134
data/lib/spec_id/precision/filter/output.rb +0 -148
data/lib/spec_id/precision/filter.rb +0 -637
data/lib/spec_id/precision/output.rb +0 -60
data/lib/spec_id/precision/prob/cmdline.rb +0 -160
data/lib/spec_id/precision/prob/output.rb +0 -94
data/lib/spec_id/precision/prob.rb +0 -249
data/lib/spec_id/proph/pep_summary.rb +0 -104
data/lib/spec_id/proph/prot_summary.rb +0 -484
data/lib/spec_id/proph.rb +0 -4
data/lib/spec_id/protein_summary.rb +0 -489
data/lib/spec_id/sequest/params.rb +0 -316
data/lib/spec_id/sequest/pepxml.rb +0 -1458
data/lib/spec_id/sequest.rb +0 -33
data/lib/spec_id/sqt.rb +0 -349
data/lib/spec_id/srf.rb +0 -973
data/lib/spec_id.rb +0 -778
data/lib/spec_id_xml.rb +0 -99
data/lib/transmem/phobius.rb +0 -147
data/lib/transmem/toppred.rb +0 -368
data/lib/transmem.rb +0 -157
data/lib/validator/aa.rb +0 -48
data/lib/validator/aa_est.rb +0 -112
data/lib/validator/background.rb +0 -77
data/lib/validator/bias.rb +0 -95
data/lib/validator/cmdline.rb +0 -431
data/lib/validator/decoy.rb +0 -107
data/lib/validator/digestion_based.rb +0 -70
data/lib/validator/probability.rb +0 -51
data/lib/validator/prot_from_pep.rb +0 -234
data/lib/validator/q_value.rb +0 -32
data/lib/validator/transmem.rb +0 -272
data/lib/validator/true_pos.rb +0 -46
data/lib/validator.rb +0 -197
data/lib/xml.rb +0 -38
data/lib/xml_style_parser.rb +0 -119
data/lib/xmlparser_wrapper.rb +0 -19
data/release_notes.txt +0 -2
data/script/compile_and_plot_smriti_final.rb +0 -97
data/script/create_little_pepxml.rb +0 -61
data/script/degenerate_peptides.rb +0 -47
data/script/estimate_fpr_by_cysteine.rb +0 -226
data/script/extract_gradient_programs.rb +0 -56
data/script/find_cysteine_background.rb +0 -137
data/script/genuine_tps_and_probs.rb +0 -136
data/script/get_apex_values_rexml.rb +0 -44
data/script/histogram_probs.rb +0 -61
data/script/mascot_fix_pepxml.rb +0 -123
data/script/msvis.rb +0 -42
data/script/mzXML2timeIndex.rb +0 -25
data/script/peps_per_bin.rb +0 -67
data/script/prep_dir.rb +0 -121
data/script/simple_protein_digestion.rb +0 -27
data/script/smriti_final_analysis.rb +0 -103
data/script/sqt_to_meta.rb +0 -24
data/script/top_hit_per_scan.rb +0 -67
data/script/toppred_to_yaml.rb +0 -47
data/script/tpp_installer.rb +0 -249
data/specs/align_spec.rb +0 -79
data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
data/specs/bin/fasta_shaker_spec.rb +0 -259
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
data/specs/bin/filter_and_validate_spec.rb +0 -180
data/specs/bin/ms_to_lmat_spec.rb +0 -34
data/specs/bin/prob_validate_spec.rb +0 -86
data/specs/bin/protein_summary_spec.rb +0 -14
data/specs/fasta_spec.rb +0 -354
data/specs/gi_spec.rb +0 -22
data/specs/load_bin_path.rb +0 -7
data/specs/merge_deep_spec.rb +0 -13
data/specs/ms/gradient_program_spec.rb +0 -77
data/specs/ms/msrun_spec.rb +0 -498
data/specs/ms/parser_spec.rb +0 -92
data/specs/ms/spectrum_spec.rb +0 -87
data/specs/pi_zero_spec.rb +0 -115
data/specs/qvalue_spec.rb +0 -39
data/specs/roc_spec.rb +0 -251
data/specs/rspec_autotest.rb +0 -149
data/specs/sample_enzyme_spec.rb +0 -126
data/specs/spec_helper.rb +0 -135
data/specs/spec_id/aa_freqs_spec.rb +0 -52
data/specs/spec_id/bioworks_spec.rb +0 -148
data/specs/spec_id/digestor_spec.rb +0 -75
data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
data/specs/spec_id/precision/filter/output_spec.rb +0 -31
data/specs/spec_id/precision/filter_spec.rb +0 -246
data/specs/spec_id/precision/prob_spec.rb +0 -44
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
data/specs/spec_id/protein_summary_spec.rb +0 -189
data/specs/spec_id/sequest/params_spec.rb +0 -68
data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
data/specs/spec_id/sequest_spec.rb +0 -38
data/specs/spec_id/sqt_spec.rb +0 -246
data/specs/spec_id/srf_spec.rb +0 -172
data/specs/spec_id/srf_spec_helper.rb +0 -139
data/specs/spec_id_helper.rb +0 -33
data/specs/spec_id_spec.rb +0 -366
data/specs/spec_id_xml_spec.rb +0 -33
data/specs/transmem/phobius_spec.rb +0 -425
data/specs/transmem/toppred_spec.rb +0 -298
data/specs/transmem_spec.rb +0 -60
data/specs/transmem_spec_shared.rb +0 -64
data/specs/validator/aa_est_spec.rb +0 -66
data/specs/validator/aa_spec.rb +0 -40
data/specs/validator/background_spec.rb +0 -67
data/specs/validator/bias_spec.rb +0 -122
data/specs/validator/decoy_spec.rb +0 -51
data/specs/validator/fasta_helper.rb +0 -26
data/specs/validator/prot_from_pep_spec.rb +0 -141
data/specs/validator/transmem_spec.rb +0 -146
data/specs/validator/true_pos_spec.rb +0 -58
data/specs/validator_helper.rb +0 -33
data/specs/xml_spec.rb +0 -12
data/test_files/000_pepxml18_small.xml +0 -206
data/test_files/020a.mzXML.timeIndex +0 -4710
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
data/test_files/4-03-03_small-prot.xml +0 -321
data/test_files/4-03-03_small.xml +0 -3876
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +0 -5999
data/test_files/bioworks31.params +0 -77
data/test_files/bioworks32.params +0 -62
data/test_files/bioworks33.params +0 -63
data/test_files/bioworks_single_run_small.xml +0 -7237
data/test_files/bioworks_small.fasta +0 -212
data/test_files/bioworks_small.params +0 -63
data/test_files/bioworks_small.phobius +0 -109
data/test_files/bioworks_small.toppred.out +0 -2847
data/test_files/bioworks_small.xml +0 -5610
data/test_files/bioworks_with_INV_small.xml +0 -3753
data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +0 -304
data/test_files/messups.fasta +0 -297
data/test_files/opd1/000.my_answer.100lines.xml +0 -101
data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
data/test_files/opd1/000_020_3prots-prot.xml +0 -62
data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
data/test_files/opd1/sequest.3.1.params +0 -77
data/test_files/opd1/sequest.3.2.params +0 -62
data/test_files/opd1/twenty_scans.mzXML +0 -418
data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +0 -9
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
data/test_files/pepproph_small.xml +0 -4691
data/test_files/phobius.small.noheader.txt +0 -50
data/test_files/phobius.small.small.txt +0 -53
data/test_files/s01_anC1_ld020mM.key.txt +0 -25
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +0 -297
data/test_files/small.sqt +0 -87
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +0 -14340
data/test_files/tf_bioworks2excel.txt.actual +0 -1035
data/test_files/toppred.small.out +0 -416
data/test_files/toppred.xml.out +0 -318
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
data/test_files/yeast_gly_small-prot.xml +0 -265
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
data/test_files/yeast_gly_small.xml +0 -3807
data/test_files/yeast_gly_small2.parentTimes +0 -6

data/README CHANGED Viewed

@@ -1,3 +1,12 @@
+= {Mspire}[http://mspire.rubyforge.org]
+A library for working with mass spectrometry proteomics data.
+<em> Mspire is going through a re-write as of version 0.5.0 to support a new
+development model.  Many modules are absent but will gradually be added back.
+Use the 0.4 releases as necessary. </em>
+== Description
 mspire - 'Mass Spectrometry Proteomics in Ruby' is a collection of tools for
 working with MS proteomics data in ruby.  It seeks to provide support for open
@@ -5,19 +14,22 @@ standards (e.g., parsers for mzData, mzXML, Peptide/Protein Prophet and the
 TPP) and contribute other useful functionality for working with mass
 spectrometry data in ruby.
-Current Focus
--------------
+* Lighthouse[http://bahuvrihi.lighthouseapp.com/projects/16692-mspire/tickets]
+* Github[http://github.com/bahuvrihi/mspire/tree/master]
+* {Google Group}[http://groups.google.com/group/mspire-forum]
+--
+=== Current Focus
 The project is currently focusing on the following:
-* SEQUEST data (particularly the output of Bioworks 3.2-3.3)
+* SEQUEST data (particularly the output of Bioworks 3.2-3.3.1)
 * mzXML
 * mzData
 * ProteinProphet
 * Preparation of files for [obiwarp](http://obi-warp.sourceforge.net/)
-Features
---------
+=== Features
 * mzXML (version 1, 2, and 3) parsing
 * mzData parsing
@@ -37,31 +49,29 @@ Validation by:
   * Generic sample bias (e.g., low abundance/high abundance proteins)
   * Defined sample
-Spectra and Spectra Identification
-----------------------------------
+=== Spectra and Spectra Identification
 The [MS](ms/index.html) namespace contains objects for working with mass spectra and associated file formats.
 The [SpecID](spec_id/index.html) namespace contains objects for working with spectral identifications.
-Tutorials
----------
+=== Tutorials
 * [Database Searching Tutorial](tutorial/database_searching/index.html) -
 Demonstrates two methods for running and analysing Bioworks output to obtain
 false positive rates using mspire executables.
+++
+== Installation
-Warning
--------
+Mspire is available as a gem on RubyForge[http://rubyforge.org/projects/mspire].  Use:
+  % gem install mspire
+= Warning
 This is an experimental package.  As such, all versions prior to version 1.0
 may contain interface changes on minor revisions (major.minor.build) (e.g.,
 0.4.0 may contain interface change from 0.3.9).  Beyond version 1.0, the
 versioning scheme will be strictly adhered to (no interface changes except on
 major revisions).
-Installation
-------------
-see [Install](install/index.html)

data/changelog.txt CHANGED Viewed

@@ -1,5 +1,5 @@
-## version 0.1.7
+== version 0.1.7
 1. A couple of scripts and subroutines were hashing peptides but not on the file
 basename.  This would result in slightly incorrect results (any time there
@@ -33,33 +33,33 @@ Rate' and 'FPR' from the package. It's been suggested that FP/(TP+FP) be
 called the False Positive Predictive Rate (FPPR).  I will probably implement
 this in a future release.
-## version 0.2.0
+== version 0.2.0
 Revamped the way SpecID works (it is now mixed-in).
 Added support for modifications to bioworks_to_pepxml.rb
 Can read .srf files (nearly interchangeable with bioworks files)
 Redid filter.rb
-## version 0.2.1
+== version 0.2.1
 minor bugfix
-## version 0.2.2
+== version 0.2.2
 made compatible with Bioworks fasta file reverser and updated tutorial.
 Killed classify_by_prefix routine in favor of classify_by_false_flag which has
 a prefix option
-## version 0.2.3
+== version 0.2.3
 in protein_summary.rb added handling for proteins with no annotation. (either
 display NA or use gi2annot to grab them from NCBI)
-## version 0.2.5
+== version 0.2.5
 renamed prep_list in roc (potential breaks in code)
-## version 0.2.6
+== version 0.2.6
 1. Massive refactorization of filtering and validation.  Validation objects are
 created and then can be used to validate just about anything.
@@ -75,105 +75,105 @@ appear to influence our analyses, however). Fixed.
 2. Enzymes with no exceptions (e.g., cuts at KR) would report one too many
 missed cleavages if the last amino acid was a cut point. Fixed.
-## version 0.2.7
+== version 0.2.7
 1. In conversion from bioworks to pepxml, the default was trypsin (KR/P).
 Now, the sample enzyme is set explicitly from the params file and the option
 is not available.  This can give more accurate pepxml files than from
 previous depending on your enzyme.
-## version 0.2.9
+== version 0.2.9
 1. Added support for phobius transmembrane predictions
 2. have filter_and_validate.rb working well (multiple validators allowed).
 3. Can read bioworks 3.3.1 .srf files (.srf version 3.5 files)
 4. Added a bias validator
-## version 0.2.10
+== version 0.2.10
 1. Fixed --hits_separate flag in spec_id/filter
-## version 0.2.11
+== version 0.2.11
 1. Added prob precision support and reorganized filter_and_validate libs
-## version 0.2.12
+== version 0.2.12
 1. Fixed bug in transmem for prob and others.
 2. Can use axml (XMLParser based) or libxml depending on availability
-## version 0.2.13
+== version 0.2.13
 1. Fixed issue with --hits_separate
 2. filter_and_validate.rb requires decoy validator if decoy proteins
 (refactored code)
-## version 0.2.14
+== version 0.2.14
 1. Can read PeptideProphet files (should be able to read pepxml files, too)
 2. API change: Some slight modifications to the Sequest::PepXML object
 interfaces and implementations (using ArrayClass)
-## version 0.2.15
+== version 0.2.15
 1. can convert srf files to sqt files
-## version 0.3.0
+== version 0.3.0
 1. IMPORTANT BUG FIX: protein reporting in srf files is correct now (proteins after the first protein were being assigned to the last hit in an out file).
 2. SQT export is correct and works at least on 3.2 and 3.3.1.
-## version 0.3.1
+== version 0.3.1
 1. Bug fix in srf filtering (num_hits adjusted)
-## version 0.3.2
+== version 0.3.2
 1. Uses sequest peptide_mass_tolerance filter on srf group files by default
 now.
-## version 0.3.3
+== version 0.3.3
 1. Worked out minor kinks in prob_precision.rb
-## version 0.3.4
+== version 0.3.4
 1. filters >= +3 charged ions now.
-## version 0.3.5
+== version 0.3.5
 1. fixed creation of background distribution in validators (hash_by base_name,
 first_scan, charge now)
-## version 0.3.6
+== version 0.3.6
 1. split off bad_aa_est from bad_aa
-## version 0.3.7
+== version 0.3.7
 1. can deal with No_Enzyme searches now (while still capable of setting
 sample_enzyme)
-## version 0.3.8
+== version 0.3.8
 1. can set a decoy to target ratio for decoy validation
 2. added mass calculator in Mass::Calculator
-## version 0.3.9
+== version 0.3.9
 1. doesn't clobber mzdata filename in ms_to_lmat.rb conversion
-## version 0.3.10
+== version 0.3.10
 1. added run_percolator.rb script which makes running multiple files easy
-## version 0.3.11
+== version 0.3.11
 1. faster sensing of bad scan tags in mzXML v. 2.0 files
 2. implemented lazy evaluation of spectrum in 2 different ways allowing much
 larger files to be parsed
-## version 0.4.0
+== version 0.4.0
 1. ** INTERFACE CHANGE: each scan can only have one precursor (used to be an array)
 2. ** INTERFACE CHANGE: spectrum mz and intensity data accessed with mzs and intensities
@@ -182,46 +182,15 @@ larger files to be parsed
 eval methods (however, the method intensity_at_mz will still work (causing
 evaluation))
-## version 0.4.1
+== version 0.4.1
 1. added support for reading mzXML version 3.0 (may fail in some cases)
-## version 0.4.2
+== version 0.4.2
 1. added MS::MSRun.open method
 2. added method to write dta files from SRF
-## version 0.4.3
+== version 0.4.3
 1. added to_mfg_file from SRF
-2. added to_dta_files from SRF complete with streaming .tar.gz output (and
-supporting .zip output but it has to make tmp files)
-## version 0.4.4
-1. implemented q-value and pi_0 methods of Storey
-2. can do complete q-value calculations given p-values
-3. can determine a pi_0 given a list of target and decoy values (as booleans)
-4. can determine a pi_0 given a list containing numbers of decoy and target
-values as is often encountered with filtering
-5. prob_validate.rb implements a q-value option for turning PeptideProphet
-probabilities into q-values
-6. filter_validate.rb implements a p value method using xcorr values, however,
-this is not very effective since xcorr values underrepresent the
-difference between good hits and bad hits
-## version 0.4.5
-1. using pi_zero instead of decoy_to_target_ratio.  While all tests are
-passing, this release should be considered experimental with the use of any
-target-decoy validation.
-## version 0.4.6
-1. added NOTE to --to_qvalues option to include all results (no low prob
-filter)
-## version 0.4.7
-1. Added ability to quickly grab sequest params out of a .SRF file
-2. Added helpful runtime error if print_duplicate_references is 0.
-## version 0.4.9
-1. quiet some unneeded output and fixed truncation of filenames with '.'
-inside them.

data/lib/ms/calc.rb ADDED Viewed

@@ -0,0 +1,32 @@
+module Ms
+  module Calc  # ppm (parts-per-million) helpers; module_function below exposes them as Ms::Calc.<name>
+      module_function
+      #
+      # ppm (parts-per-million) calculations... maybe use RUnit
+      #
+      def ppm_tol_at(mz, ppm)  # absolute tolerance at mz: mz * ppm / 10**6
+        1.0 * mz * ppm / 10**6  # the leading 1.0 forces floating-point arithmetic
+      end
+      def ppm_span_at(mz, ppm)  # => two-element array [mz - tol, mz + tol]
+        tol = ppm_tol_at(mz, ppm)
+        [mz-tol, mz+tol]
+      end
+      def ppm_range_at(mz, ppm)  # => Range from mz - tol up to, but excluding, mz + tol
+        mz = mz.to_f
+        tol = ppm_tol_at(mz, ppm)
+        mz-tol...mz+tol  # three dots: the upper bound is exclusive
+      end
+      # Rounds n to the specified precision (ie number of decimal places)
+      # def round(n, precision)
+      #   factor = 10**precision.to_i
+      #   (n * factor).round.to_f / factor
+      # end
+  end
+end

data/lib/ms/data/interleaved.rb ADDED Viewed

@@ -0,0 +1,60 @@
+require 'ms/data/simple'
+module Ms
+  module Data
+    module_function
+    # Initializes a new interleaved data array.
+    def new_interleaved(unresolved_data, n=2)
+      Interleaved.new(unresolved_data, n=2)
+    end
+    # An Interleaved data array lazily evaluates it's unresolved data as
+    # an interleaved array of n members.  The unresolved data is evaluated
+    # into an array using to_a.
+    #
+    #   i = Ms::Data::Interleaved.new([1,4,2,5,3,6])
+    #   i.unresolved_data    # => [1,4,2,5,3,6]
+    #   i.data               # => []
+    #   i[0]                 # => [1,2,3]
+    #   i[1]                 # => [4,5,6]
+    #   i.data               # => [[1,2,3], [4,5,6]]
+    #
+    class Interleaved < Simple
+      attr_reader :n
+      def initialize(unresolved_data, n=2)
+        @n = 2
+        super(unresolved_data)
+      end
+      def [](index)
+        resolve.data[index]
+      end
+      def resolved?
+        !@data.empty?
+      end
+      def resolve
+        return(self) if resolved?
+        unresolved_data = @unresolved_data.to_a
+        unless unresolved_data.length % n == 0
+          raise ArgumentError, "interleaved data must have a number of elements evenly divisible by n (#{n})"
+        end
+        n.times { @data << [] }
+        map = @data * (unresolved_data.length/n)
+        unresolved_data.each_with_index do |item, i|
+          map[i] << item
+        end
+        self
+      end
+    end
+  end
+end

data/lib/ms/data/lazy_io.rb ADDED Viewed

@@ -0,0 +1,73 @@
+module Ms
+  module Data
+    # LazyIO represents data to be lazily read from an IO.  To read the data
+    # from the IO, either string or to_a may be called (to_a unpacks the
+    # string into an array using the decode_format and unpack_format).
+    #
+    # LazyIO is a suitable unresolved_data source for Ms::Data formats.
+    class LazyIO
+      NETWORK_FLOAT = 'g*'  # unpack directive: 32-bit floats, network (big-endian) order
+      NETWORK_DOUBLE = 'G*'  # unpack directive: 64-bit floats, network (big-endian) order
+      LITTLE_ENDIAN_FLOAT = 'e*'  # unpack directive: 32-bit floats, little-endian order
+      LITTLE_ENDIAN_DOUBLE = 'E*'  # unpack directive: 64-bit floats, little-endian order
+      BASE_64 = 'm'  # unpack directive: base64-encoded text
+      class << self
+        # Returns the unpacking code for the given precision (32 or 64-bit)
+        # and network order (true for big-endian).  Raises ArgumentError for any other precision.
+        def unpack_code(precision, network_order)
+          case precision
+          when 32 then network_order ? NETWORK_FLOAT : LITTLE_ENDIAN_FLOAT
+          when 64 then network_order ? NETWORK_DOUBLE : LITTLE_ENDIAN_DOUBLE
+          else raise ArgumentError, "unknown precision (should be 32 or 64): #{precision}"
+          end
+        end
+      end
+      # The IO from which string is read
+      attr_reader :io
+      # The start index (an io position) from which string is read
+      attr_reader :start_index
+      # The number of bytes to be read from io when evaluating string
+      attr_reader :num_bytes
+      # Indicates the unpacking format
+      attr_reader :unpack_format
+      # Indicates a decoding format, may be false to unpack string
+      # without decoding.
+      attr_reader :decode_format
+      def initialize(io, start_index=io.pos, num_bytes=nil, unpack_format=NETWORK_FLOAT, decode_format=BASE_64)  # start_index defaults to io's current position
+        @io = io
+        @start_index = start_index
+        @num_bytes = num_bytes
+        @unpack_format = unpack_format
+        @decode_format = decode_format
+      end
+      # Positions io at start_index and reads a string of num_bytes length.
+      # The string is newly read from io each time string is called.
+      def string
+        io.pos = start_index unless io.pos == start_index  # reposition only when io has moved
+        io.read(num_bytes)
+      end
+      # Resets the cached array (returned by to_a) so that the array will
+      # be re-read from io.
+      def reset
+        @array = nil
+      end
+      # Reads string and unpacks using decode_format and unpack_format.  The
+      # array is cached internally; to re-read the array, use reset.
+      def to_a
+        @array ||= (decode_format ? string.unpack(decode_format)[0] : string).unpack(unpack_format)  # decode first when decode_format is set, then unpack
+      end
+    end
+  end
+end

data/lib/ms/data/lazy_string.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'ms/data/lazy_io'
+require 'stringio'
+module Ms
+  module Data
+    # LazyString is a LazyIO initialized from a string, which is converted into
+    # a StringIO.
+    class LazyString < LazyIO
+      def initialize(string, unpack_format=NETWORK_FLOAT, decode_format=BASE_64)
+        super(StringIO.new(string), 0, string.length, unpack_format, decode_format)
+      end
+    end
+  end
+end

data/lib/ms/data/simple.rb ADDED Viewed

@@ -0,0 +1,59 @@
+module Ms
+  module Data
+    module_function
+    # Initializes a new simple data array.
+    def new_simple(unresolved_data)
+      Simple.new(unresolved_data)
+    end
+    # A Simple data array that lazily evaluates unresolved_data, and
+    # each member of unresolved_data using to_a:
+    #
+    #   class LazyObject
+    #     attr_reader :to_a
+    #     def initialize(array)
+    #       @to_a = array
+    #     end
+    #   end
+    #
+    #   a = LazyObject.new([1,2,3])
+    #   b = LazyObject.new([4,5,6])
+    #   s = Ms::Data::Simple.new([a, b])
+    #
+    #   s.unresolved_data     # => [a, b]
+    #   s.data                # => []
+    #   s[0]                  # => [1,2,3]
+    #   s[1]                  # => [4,5,6]
+    #   s.data                # => [[1,2,3], [4,5,6]]
+    #
+    class Simple
+      # The underlying resolved data store.
+      attr_reader :data
+      # The underlying unresolved data store.
+      attr_reader :unresolved_data
+      def initialize(unresolved_data)  # starts with an empty resolved store; nothing is evaluated yet
+        @data = []
+        @unresolved_data = unresolved_data
+      end
+      def [](index)  # resolves (and caches) only the requested member
+        @data[index] ||= @unresolved_data.to_a[index].to_a
+      end
+      def resolve  # resolves every member not yet resolved; returns self
+        0.upto(@unresolved_data.length - 1) do |index|  # NOTE(review): calls length on the source directly, whereas [] only needs to_a -- confirm every source responds to length
+          self[index]
+        end unless resolved?
+        self
+      end
+      def resolved?  # true when data holds as many non-nil entries as the source has members
+        @data.compact.length == @unresolved_data.length
+      end
+    end
+  end
+end

data/lib/ms/data/transposed.rb ADDED Viewed

@@ -0,0 +1,41 @@
+require 'ms/data/simple'
+module Ms
+  module Data
+    module_function
+    # Initializes a new transposed data array.
+    def new_transposed(unresolved_data)
+      Transposed.new(unresolved_data)
+    end
+    # A Transposed data array lazily evaluates its unresolved data as
+    # a transposed array.  The unresolved data is evaluated
+    # into an array using to_a.
+    #
+    #   t = Ms::Data::Transposed.new([[1,4],[2,5],[3,6]])
+    #
+    #   t.unresolved_data  # => [[1,4],[2,5],[3,6]]
+    #   t.data             # => []
+    #   t[0]               # => [1,2,3]
+    #   t[1]               # => [4,5,6]
+    #   t.data             # => [[1,2,3], [4,5,6]]
+    #
+    class Transposed < Simple
+      def [](index)  # resolves all data on first access, then indexes the result
+        resolve.data[index]
+      end
+      def resolved?  # any content in @data counts as resolved
+        !@data.empty?
+      end
+      def resolve  # transposes the source once; returns self
+        @data = @unresolved_data.to_a.transpose unless resolved?
+        self
+      end
+    end
+  end
+end

data/lib/ms/data.rb ADDED Viewed

@@ -0,0 +1,57 @@
+require 'ms/data/interleaved'
+require 'ms/data/transposed'
+module Ms
+  # The Data module contains a number of classes providing a standard way to
+  # resolve various data storage formats into a 'simple' data array.
+  #
+  #   type               format
+  #   simple             [[mzs,...], [intensities...]]
+  #   transposed         [[mz,intensity], [mz,intensity], ...]
+  #   interleaved        [mz,intensity,mz,intensity,...]
+  #
+  # For instance:
+  #
+  #   s = Data.new([[1,2,3], [4,5,6]], :simple)
+  #   s.resolve.data        # => [[1,2,3], [4,5,6]]
+  #
+  #   t = Data.new([[1,4],[2,5],[3,6]], :transposed)
+  #   t.resolve.data        # => [[1,2,3], [4,5,6]]
+  #
+  #   i = Data.new([1,4,2,5,3,6], :interleaved)
+  #   i.resolve.data        # => [[1,2,3], [4,5,6]]
+  #
+  # Data is always resolved by calling to_a on the unresolved data object
+  # and then rearranging as needed (in the case of simple data, to_a is
+  # also called on each member of the unresolved data array).  This lazy
+  # resolution allows the use of non-array unresolved_data objects such
+  # as Data::LazyString:
+  #
+  #   str = [[1,4,2,5,3,6].pack("g*")].pack("m")
+  #   unresolved_data = Data::LazyString.new(str)
+  #
+  #   i = Data.new(unresolved_data, :interleaved)
+  #   i.resolve.data        # => [[1,2,3], [4,5,6]]
+  #
+  # The big advantage of lazy data resolution is that Data objects
+  # may be instantiated cheaply while expensive operations like unpacking and
+  # rearrangement may be put off or not executed at all.
+  #
+  module Data
+    module_function
+    # Initializes a new data array of the specified type by forwarding
+    # data to the "new_<type>" method.
+    #
+    #   simple = Ms::Data.new([[1,2,3], [4,5,6]], :simple)
+    #   simple.class           # => Ms::Data::Simple
+    #
+    #   interleaved = Ms::Data.new([1,4,2,5,3,6], :interleaved)
+    #   interleaved.class      # => Ms::Data::Interleaved
+    #
+    def new(data, type=:simple)  # type defaults to :simple
+      send("new_#{type}", data)  # only data is forwarded; a type with no new_<type> method raises NoMethodError
+    end
+  end
+end

data/lib/ms/format/format_error.rb ADDED Viewed

@@ -0,0 +1,12 @@
+module Ms
+  module Format
+    class FormatError < Exception
+      attr_accessor :str
+      def initialize(msg, str)
+        super(msg)
+        @str = str
+      end
+    end
+  end
+end