RubyGems - mspire - Versions diffs - 0.4.9 → 0.5.0 - Mend

mspire 0.4.9 → 0.5.0

Files changed (255) hide show

data/README +27 -17
data/changelog.txt +31 -62
data/lib/ms/calc.rb +32 -0
data/lib/ms/data/interleaved.rb +60 -0
data/lib/ms/data/lazy_io.rb +73 -0
data/lib/ms/data/lazy_string.rb +15 -0
data/lib/ms/data/simple.rb +59 -0
data/lib/ms/data/transposed.rb +41 -0
data/lib/ms/data.rb +57 -0
data/lib/ms/format/format_error.rb +12 -0
data/lib/ms/spectrum.rb +25 -384
data/lib/ms/support/binary_search.rb +126 -0
data/lib/ms.rb +10 -10
metadata +38 -350
data/INSTALL +0 -58
data/README.rdoc +0 -18
data/Rakefile +0 -330
data/bin/aafreqs.rb +0 -23
data/bin/bioworks2excel.rb +0 -14
data/bin/bioworks_to_pepxml.rb +0 -148
data/bin/bioworks_to_pepxml_gui.rb +0 -225
data/bin/fasta_shaker.rb +0 -5
data/bin/filter_and_validate.rb +0 -5
data/bin/gi2annot.rb +0 -14
data/bin/id_class_anal.rb +0 -112
data/bin/id_precision.rb +0 -172
data/bin/ms_to_lmat.rb +0 -67
data/bin/pepproph_filter.rb +0 -16
data/bin/prob_validate.rb +0 -6
data/bin/protein_summary.rb +0 -6
data/bin/protxml2prots_peps.rb +0 -32
data/bin/raw_to_mzXML.rb +0 -55
data/bin/run_percolator.rb +0 -122
data/bin/sqt_group.rb +0 -26
data/bin/srf_group.rb +0 -27
data/bin/srf_to_sqt.rb +0 -40
data/lib/align/chams.rb +0 -78
data/lib/align.rb +0 -154
data/lib/archive/targz.rb +0 -94
data/lib/bsearch.rb +0 -120
data/lib/core_extensions.rb +0 -16
data/lib/fasta.rb +0 -626
data/lib/gi.rb +0 -124
data/lib/group_by.rb +0 -10
data/lib/index_by.rb +0 -11
data/lib/merge_deep.rb +0 -21
data/lib/ms/converter/mzxml.rb +0 -77
data/lib/ms/gradient_program.rb +0 -170
data/lib/ms/msrun.rb +0 -244
data/lib/ms/msrun_index.rb +0 -108
data/lib/ms/parser/mzdata/axml.rb +0 -67
data/lib/ms/parser/mzdata/dom.rb +0 -175
data/lib/ms/parser/mzdata/libxml.rb +0 -7
data/lib/ms/parser/mzdata.rb +0 -31
data/lib/ms/parser/mzxml/axml.rb +0 -70
data/lib/ms/parser/mzxml/dom.rb +0 -182
data/lib/ms/parser/mzxml/hpricot.rb +0 -253
data/lib/ms/parser/mzxml/libxml.rb +0 -19
data/lib/ms/parser/mzxml/regexp.rb +0 -122
data/lib/ms/parser/mzxml/rexml.rb +0 -72
data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
data/lib/ms/parser/mzxml.rb +0 -282
data/lib/ms/parser.rb +0 -108
data/lib/ms/precursor.rb +0 -25
data/lib/ms/scan.rb +0 -81
data/lib/mspire.rb +0 -4
data/lib/pi_zero.rb +0 -244
data/lib/qvalue.rb +0 -161
data/lib/roc.rb +0 -187
data/lib/sample_enzyme.rb +0 -160
data/lib/scan_i.rb +0 -21
data/lib/spec_id/aa_freqs.rb +0 -170
data/lib/spec_id/bioworks.rb +0 -497
data/lib/spec_id/digestor.rb +0 -138
data/lib/spec_id/mass.rb +0 -179
data/lib/spec_id/parser/proph.rb +0 -335
data/lib/spec_id/precision/filter/cmdline.rb +0 -218
data/lib/spec_id/precision/filter/interactive.rb +0 -134
data/lib/spec_id/precision/filter/output.rb +0 -148
data/lib/spec_id/precision/filter.rb +0 -637
data/lib/spec_id/precision/output.rb +0 -60
data/lib/spec_id/precision/prob/cmdline.rb +0 -160
data/lib/spec_id/precision/prob/output.rb +0 -94
data/lib/spec_id/precision/prob.rb +0 -249
data/lib/spec_id/proph/pep_summary.rb +0 -104
data/lib/spec_id/proph/prot_summary.rb +0 -484
data/lib/spec_id/proph.rb +0 -4
data/lib/spec_id/protein_summary.rb +0 -489
data/lib/spec_id/sequest/params.rb +0 -316
data/lib/spec_id/sequest/pepxml.rb +0 -1458
data/lib/spec_id/sequest.rb +0 -33
data/lib/spec_id/sqt.rb +0 -349
data/lib/spec_id/srf.rb +0 -973
data/lib/spec_id.rb +0 -778
data/lib/spec_id_xml.rb +0 -99
data/lib/transmem/phobius.rb +0 -147
data/lib/transmem/toppred.rb +0 -368
data/lib/transmem.rb +0 -157
data/lib/validator/aa.rb +0 -48
data/lib/validator/aa_est.rb +0 -112
data/lib/validator/background.rb +0 -77
data/lib/validator/bias.rb +0 -95
data/lib/validator/cmdline.rb +0 -431
data/lib/validator/decoy.rb +0 -107
data/lib/validator/digestion_based.rb +0 -70
data/lib/validator/probability.rb +0 -51
data/lib/validator/prot_from_pep.rb +0 -234
data/lib/validator/q_value.rb +0 -32
data/lib/validator/transmem.rb +0 -272
data/lib/validator/true_pos.rb +0 -46
data/lib/validator.rb +0 -197
data/lib/xml.rb +0 -38
data/lib/xml_style_parser.rb +0 -119
data/lib/xmlparser_wrapper.rb +0 -19
data/release_notes.txt +0 -2
data/script/compile_and_plot_smriti_final.rb +0 -97
data/script/create_little_pepxml.rb +0 -61
data/script/degenerate_peptides.rb +0 -47
data/script/estimate_fpr_by_cysteine.rb +0 -226
data/script/extract_gradient_programs.rb +0 -56
data/script/find_cysteine_background.rb +0 -137
data/script/genuine_tps_and_probs.rb +0 -136
data/script/get_apex_values_rexml.rb +0 -44
data/script/histogram_probs.rb +0 -61
data/script/mascot_fix_pepxml.rb +0 -123
data/script/msvis.rb +0 -42
data/script/mzXML2timeIndex.rb +0 -25
data/script/peps_per_bin.rb +0 -67
data/script/prep_dir.rb +0 -121
data/script/simple_protein_digestion.rb +0 -27
data/script/smriti_final_analysis.rb +0 -103
data/script/sqt_to_meta.rb +0 -24
data/script/top_hit_per_scan.rb +0 -67
data/script/toppred_to_yaml.rb +0 -47
data/script/tpp_installer.rb +0 -249
data/specs/align_spec.rb +0 -79
data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
data/specs/bin/fasta_shaker_spec.rb +0 -259
data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
data/specs/bin/filter_and_validate_spec.rb +0 -180
data/specs/bin/ms_to_lmat_spec.rb +0 -34
data/specs/bin/prob_validate_spec.rb +0 -86
data/specs/bin/protein_summary_spec.rb +0 -14
data/specs/fasta_spec.rb +0 -354
data/specs/gi_spec.rb +0 -22
data/specs/load_bin_path.rb +0 -7
data/specs/merge_deep_spec.rb +0 -13
data/specs/ms/gradient_program_spec.rb +0 -77
data/specs/ms/msrun_spec.rb +0 -498
data/specs/ms/parser_spec.rb +0 -92
data/specs/ms/spectrum_spec.rb +0 -87
data/specs/pi_zero_spec.rb +0 -115
data/specs/qvalue_spec.rb +0 -39
data/specs/roc_spec.rb +0 -251
data/specs/rspec_autotest.rb +0 -149
data/specs/sample_enzyme_spec.rb +0 -126
data/specs/spec_helper.rb +0 -135
data/specs/spec_id/aa_freqs_spec.rb +0 -52
data/specs/spec_id/bioworks_spec.rb +0 -148
data/specs/spec_id/digestor_spec.rb +0 -75
data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
data/specs/spec_id/precision/filter/output_spec.rb +0 -31
data/specs/spec_id/precision/filter_spec.rb +0 -246
data/specs/spec_id/precision/prob_spec.rb +0 -44
data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
data/specs/spec_id/protein_summary_spec.rb +0 -189
data/specs/spec_id/sequest/params_spec.rb +0 -68
data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
data/specs/spec_id/sequest_spec.rb +0 -38
data/specs/spec_id/sqt_spec.rb +0 -246
data/specs/spec_id/srf_spec.rb +0 -172
data/specs/spec_id/srf_spec_helper.rb +0 -139
data/specs/spec_id_helper.rb +0 -33
data/specs/spec_id_spec.rb +0 -366
data/specs/spec_id_xml_spec.rb +0 -33
data/specs/transmem/phobius_spec.rb +0 -425
data/specs/transmem/toppred_spec.rb +0 -298
data/specs/transmem_spec.rb +0 -60
data/specs/transmem_spec_shared.rb +0 -64
data/specs/validator/aa_est_spec.rb +0 -66
data/specs/validator/aa_spec.rb +0 -40
data/specs/validator/background_spec.rb +0 -67
data/specs/validator/bias_spec.rb +0 -122
data/specs/validator/decoy_spec.rb +0 -51
data/specs/validator/fasta_helper.rb +0 -26
data/specs/validator/prot_from_pep_spec.rb +0 -141
data/specs/validator/transmem_spec.rb +0 -146
data/specs/validator/true_pos_spec.rb +0 -58
data/specs/validator_helper.rb +0 -33
data/specs/xml_spec.rb +0 -12
data/test_files/000_pepxml18_small.xml +0 -206
data/test_files/020a.mzXML.timeIndex +0 -4710
data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
data/test_files/4-03-03_small-prot.xml +0 -321
data/test_files/4-03-03_small.xml +0 -3876
data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
data/test_files/bioworks-3.3_10prots.xml +0 -5999
data/test_files/bioworks31.params +0 -77
data/test_files/bioworks32.params +0 -62
data/test_files/bioworks33.params +0 -63
data/test_files/bioworks_single_run_small.xml +0 -7237
data/test_files/bioworks_small.fasta +0 -212
data/test_files/bioworks_small.params +0 -63
data/test_files/bioworks_small.phobius +0 -109
data/test_files/bioworks_small.toppred.out +0 -2847
data/test_files/bioworks_small.xml +0 -5610
data/test_files/bioworks_with_INV_small.xml +0 -3753
data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
data/test_files/corrupted_900.srf +0 -0
data/test_files/head_of_7MIX.srf +0 -0
data/test_files/interact-opd1_mods_small-prot.xml +0 -304
data/test_files/messups.fasta +0 -297
data/test_files/opd1/000.my_answer.100lines.xml +0 -101
data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
data/test_files/opd1/000_020-prot.png +0 -0
data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
data/test_files/opd1/000_020_3prots-prot.xml +0 -62
data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
data/test_files/opd1/sequest.3.1.params +0 -77
data/test_files/opd1/sequest.3.2.params +0 -62
data/test_files/opd1/twenty_scans.mzXML +0 -418
data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
data/test_files/opd1/twenty_scans_answ.lmat +0 -0
data/test_files/opd1/twenty_scans_answ.lmata +0 -9
data/test_files/opd1_020_beginning.RAW +0 -0
data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
data/test_files/pepproph_small.xml +0 -4691
data/test_files/phobius.small.noheader.txt +0 -50
data/test_files/phobius.small.small.txt +0 -53
data/test_files/s01_anC1_ld020mM.key.txt +0 -25
data/test_files/s01_anC1_ld020mM.meth +0 -0
data/test_files/small.fasta +0 -297
data/test_files/small.sqt +0 -87
data/test_files/smallraw.RAW +0 -0
data/test_files/tf_bioworks2excel.bioXML +0 -14340
data/test_files/tf_bioworks2excel.txt.actual +0 -1035
data/test_files/toppred.small.out +0 -416
data/test_files/toppred.xml.out +0 -318
data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
data/test_files/yeast_gly_small-prot.xml +0 -265
data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
data/test_files/yeast_gly_small.xml +0 -3807
data/test_files/yeast_gly_small2.parentTimes +0 -6

data/script/estimate_fpr_by_cysteine.rb DELETED Viewed

@@ -1,226 +0,0 @@
-#!/usr/bin/ruby -w
-## The yeast Scal db mean background is: 0.00984
-## The yeast Cysteine background freq is: 0.0131986582396467
-pep_seq_re = /<search_hit .* peptide="(\w+)"/o
-pep_prob_re = /<peptideprophet_result probability="([\w\.]+)"/o
-if ARGV.size != 3
-  puts "usage #{File.basename(__FILE__)} cysteine_background_freq existing_freq peptide_prophet.xml"
-  puts "  outputs (tab delimited): num_peptides, prob, fpr, cys_estimated_fpr"
-  abort
-end
-def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
-  File.open(base_toplot, "w") do |fh|
-    fh.puts 'XYData'
-    fh.puts base
-    fh.puts title
-    fh.puts xaxis
-    fh.puts yaxis
-    cats.each do |ar|
-      fh.puts ar.join(" & ")
-      ar.each do |a|
-        fh.puts hash[a].join(" ")
-      end
-    end
-  end
-end
-  ############################################################################
-#### DO NOT MODIFY THIS GUY!  HE IS TAKEN FROM bin/filter_spec_id.rb
-#### CHANGE HIM THERE (eventually we need to put him in a lib file)
-# Estimates a false positive rate from the number of hits containing cysteine.
-#
-# Arguments, in order: (actual # with cys, expected # with cys,
-# total # peptides, mean_fraction_of_cysteines_true, std)
-#   ac_num_with_cys        - actual count of peptide hits containing cysteine
-#   exp_num_with_cys       - expected count of peptide hits containing cysteine
-#   total_peptides         - total number of peptide hits considered
-#   mean_fraction_true_cys - optional; when given, this fraction of the
-#                            expected count is subtracted from the actual
-#                            count before the estimate is made
-#   std_fraction_true_cys  - accepted but never used in the body
-#
-# Returns [fpr, total_number_false].
-#
-# PepHit(C) = Peptide containing cysteine
-#   # Total PepHit(C)                   # Observed Bad Pep (C)
-#   ------------------ proportional_to  ----------------------
-#   # Total PepHit                      # Total Bad PepHit (X)
-#
-# NOTE(review): there is no guard for exp_num_with_cys or total_peptides
-# being zero; both divisions are float divisions, so the result would be
-# NaN or Infinity rather than an exception -- confirm callers never pass 0.
-def fpr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil)
-  # the number of bona fide BAD cysteine hits
-  # (some of the cysteine hits (~5%) are true positives)
-  ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys
-  # the correction above can push the count negative; clamp it at zero
-  if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end
-  # scale the bad-cysteine ratio up to the whole set of peptide hits
-  total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys
-  fpr = total_number_false / total_peptides
-  [fpr, total_number_false]
-end
-############################################################################
-(cysteine_background_freq, background_freq, file) = ARGV
-cysteine_background_freq = cysteine_background_freq.to_f
-background_freq = background_freq.to_f
-seq_probs = []
-last_seq_prob = nil
-File.open(file) do |fh|
-  fh.each do |line|
-    if line =~ pep_seq_re
-      ar = Array.new(2)
-      ar[0] = $1
-      seq_probs << ar
-      last_seq_prob = ar
-    elsif line =~ pep_prob_re
-      last_seq_prob[1] = $1.to_f
-    end
-  end
-end
-#seq_probs.each do |seq|
-#  if seq[0] !~ /\w/ || !seq[1].is_a?(Float)
-#    abort "BAD PARSING!!"
-#  end
-#end
-amino_acid_as_st = 'C'
-sorted = seq_probs.sort_by {|v| v[1] }.reverse
-## traverse the peptides
-actual_cys_containing_peps = 0
-expected_cys_containing_peps = 0.0
-current_sum_one_minus_prob = 0.0
-prob_estimated_fpr = 0.0
-pep_cnt = 0
-one_minus_freq = 1.0 - cysteine_background_freq
-## tabulate:
-pep_cnts = []
-probs = []
-prob_fprs = []
-prob_tps = []
-cys_fprs = []
-cys_tps = []
-fpr_diff = []
-sorted.each do |ar|
-  pep_cnt += 1
-  pep = ar[0]
-  prob = ar[1]
-  ## Cysteine FPR: ##
-  # Expected:
-  expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
-  # Actual:
-  if pep.include?(amino_acid_as_st)
-    actual_cys_containing_peps += 1
-  end
-  (cys_fpr, total_num_false_by_cys) = fpr_by_cysteines(actual_cys_containing_peps, expected_cys_containing_peps, pep_cnt, background_freq)
-  cys_tp = pep_cnt.to_f - total_num_false_by_cys
-  ## FPR by prob: ##
-  # SUM(1-probX)/#peps
-  current_sum_one_minus_prob += 1.0 - prob
-  prob_estimated_fpr = current_sum_one_minus_prob / pep_cnt
-  prob_tp = pep_cnt.to_f - current_sum_one_minus_prob
-  ## GRAB or report the data:
-  pep_cnts << pep_cnt
-  probs << prob
-  prob_fprs << prob_estimated_fpr
-  prob_tps << prob_tp
-  cys_fprs << cys_fpr
-  cys_tps << cys_tp
-  fpr_diff << prob_estimated_fpr - cys_fpr
-  #puts [pep_cnt, prob, prob_estimated_fpr, cys_fpr].join("\t")
-end
-hash = {
-  'pep_cnts' => pep_cnts,
-  'probs' => probs,
-  'prob_fprs' => prob_fprs,
-  'prob_tps' => prob_tps,
-  'cys_fprs' => cys_fprs,
-  'cys_tps' => cys_tps,
-  'fpr_diff' => fpr_diff,
-}
-real_base = file.sub(/\.xml/,'')
-## TPS vs FPR
-base = real_base.dup
-base << "." << "tps_vs_fpr"
-base_toplot = base + '.to_plot'
-title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
-xaxis = "TPs"
-yaxis = "FPR"
-cats = [['prob_tps', 'prob_fprs'],['cys_tps', 'cys_fprs']]
-plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
-## PEPHITS vs FPR
-base = real_base.dup
-base << "." << "num_pep_hits_vs_fpr"
-base_toplot = base + '.to_plot'
-title = "Peptide Prophet FPR Estimation (bg: #{background_freq})"
-xaxis = "num peptide hits"
-yaxis = "FPR"
-cats = [['pep_cnts', 'prob_fprs'],['pep_cnts', 'cys_fprs']]
-plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
-## PEPHITS VS FPR DIFF
-base = real_base.dup
-base << "." << "num_pep_hits_vs_fpr_diff"
-base_toplot = base + '.to_plot'
-title = "num_pep_hits vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
-xaxis = "num peptide hits"
-yaxis = "FPR diff (prob - cysteine)"
-cats = [['pep_cnts', 'fpr_diff']]
-plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
-## PROB VS FPR DIFF
-base = real_base.dup
-base << "." << "prob_vs_fpr_diff"
-base_toplot = base + '.to_plot'
-title = "peptide prob vs fpr_diff (prob - cysteine) (bg: #{background_freq})"
-xaxis = "peptide probability"
-yaxis = "FPR diff (prob - cysteine)"
-cats = [['probs', 'fpr_diff']]
-plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
-=begin
-returns [number_of_prots, actual_fpr]
-def num_prots_above_fpr(prots, desired_fpr)
-  current_fpr_rate_percent = 0.0
-  previous_fpr_rate_percent = 0.0
-  current_sum_one_minus_prob = 0.0
-  proteins_within_fpr = 0
-  actual_fpr = nil
-  already_found = false
-  prot_cnt = 0
-  prots.each do |prot|
-    prot_cnt += 1
-    # SUM(1-probX)/#prots
-    current_sum_one_minus_prob += 1.0 - prot._probability.to_f
-    current_fpr_rate_percent = (current_sum_one_minus_prob / prot_cnt) * 100
-    if current_fpr_rate_percent > desired_fpr && !already_found
-      actual_fpr = previous_fpr_rate_percent
-      proteins_within_fpr = prot_cnt
-      already_found = true
-    end
-    previous_fpr_rate_percent = current_fpr_rate_percent
-  end
-  [proteins_within_fpr, actual_fpr]
-end
-=end

data/script/extract_gradient_programs.rb DELETED Viewed

@@ -1,56 +0,0 @@
-#!/usr/bin/ruby
-require 'optparse'
-require 'table'
-require 'ms/gradient_program'
-delimiter = "\t"
-table_format = false
-opts = OptionParser.new do |op|
-  op.banner = "#{File.basename(__FILE__)} [OPTIONS] <file>.meth"
-  op.on("-d", "--delimiter <tab|space|format>", "delimiter (tab default)", "format = space delimited, formatted ascii table") do |v|
-    if v == 'space'
-      delimiter = " "
-    elsif v == 'tab'
-      delimiter = "\t"
-    elsif v == 'format'
-      table_format = true
-    else
-      abort "don't recognize #{v}"
-    end
-  end
-end
-opts.parse!
-if ARGV.size == 0
-  puts opts
-  exit
-end
-sets_of_tables = {}
-ARGV.each do |file|
-  File.open(file) do |fh|
-    sets_of_tables[file] = GradientProgram.all_from_handle(fh)
-  end
-end
-sets_of_tables.each do |file, tables|
-  puts "FILE: #{file}"
-  tables.each do |gp|
-    puts "PUMP_TYPE: #{gp.pump_type}"
-    col_labels = ["time(min)", "%A", "%B", "%C", "%D", "ul/min"]
-    data = gp.time_points.map do |tp|
-      line = [tp.time, *(tp.percentages)]
-      line << tp.flow_rate
-    end
-    table = Table.new(data, nil, col_labels)
-    if table_format
-      puts table.to_formatted_string
-    else
-      puts table.to_s(delimiter)
-    end
-  end
-end

data/script/find_cysteine_background.rb DELETED Viewed

@@ -1,137 +0,0 @@
-#!/usr/bin/ruby -w
-require 'vec'
-# FOR SCer yeast db the and orbi mudpit7 the mean_actual_vs_expected fraction
-# is 0.0101409563168847
-# <peptide peptide_sequence="IEAALSDALAALQIEDPSADELR" charge="3" initial_probability="1.00" nsp_adjusted_probability="1.00" ...
-def plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
-  File.open(base_toplot, "w") do |fh|
-    fh.puts 'XYData'
-    fh.puts base
-    fh.puts title
-    fh.puts xaxis
-    fh.puts yaxis
-    cats.each do |ar|
-      fh.puts ar.join(" & ")
-      ar.each do |a|
-        fh.puts hash[a].join(" ")
-      end
-    end
-  end
-  system "plot.rb -w lp --eps_png --noenhanced #{base_toplot}"
-end
-peptide_re = /<peptide peptide_sequence="(\w+)" charge="\d" initial_probability="([\w\.]+)" nsp_adjusted_probability="([\w\.]+)"/o
-unless ARGV.size == 2
-  abort "usage: #{File.basename(__FILE__)} cysteine_background_freq <file>-prot.xml"
-end
-(cysteine_background_freq, file) = ARGV
-# each pep = [nsp_prob, init_prob, SEQUENCE]
-peps = []
-File.open(file) do |fh|
-  fh.each do |line|
-    if line =~ peptide_re
-      peps << [$3.to_f,$2.to_f,$1]
-    end
-  end
-end
-amino_acid_as_st = 'C'
-one_minus_freq = 1.0 - cysteine_background_freq.to_f
-actual_cys_containing_peps = 0
-expected_cys_containing_peps = 0.0
-current_sum_one_minus_prob = 0.0
-prob_estimated_fpr = 0.0
-pep_cnt = 0
-the_probs = []
-the_fractions = []
-special_probs = []
-#peps.sort.reverse.each do |ar|
-#peps.sort.each do |ar|
-peps.sort_by{|pep| (3.0*pep[0]) + pep[1]}.reverse.each do |ar|
-  (nsp_prob, init_prob, pep) = ar
-  ## Cysteine FPR: ##
-  # Expected:
-  expected_cys_containing_peps += (1.0 - (one_minus_freq**pep.size))
-  # Actual:
-  if pep.include?(amino_acid_as_st)
-    actual_cys_containing_peps += 1
-  end
-  fraction_ac_exp = actual_cys_containing_peps.to_f / expected_cys_containing_peps
-  special_prob = (3.0 * nsp_prob) + init_prob
-  ## Get the final fraction
-  #if special_prob < 4.0
-  #  #puts the_fractions.join(" ")
-  #  puts the_fractions.last
-  #  abort
-  #end
-  # gather data to plot
-  the_probs << nsp_prob
-  special_probs << special_prob
-  the_fractions << fraction_ac_exp
-end
-hash = {
-  'probs' => the_probs,
-  'fractions' => the_fractions,
-  'special_probs' => special_probs,
-}
-# Strip only a trailing ".xml"; an unanchored pattern would remove the first
-# ".xml" found anywhere in the path (e.g. inside a directory name).
-real_base = file.sub(/\.xml\z/,'')
-=begin
-## PROB VS FPR DIFF
-base = real_base.dup
-base << "." << "prob_FLIPPED_vs_actual_expected_fraction"
-base_toplot = base + '.to_plot'
-title = "peptide prob (sorted from 0 to 1) vs fraction with cysteines (actual/expected)"
-xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
-yaxis = "fraction with cysteines (actual/expected)"
-cats = [['probs', 'fractions']]
-plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
-=end
-=begin
-## PROB VS FPR DIFF
-base = real_base.dup
-base << "." << "prob_vs_actual_expected_fraction"
-base_toplot = base + '.to_plot'
-title = "peptide prob vs fraction with cysteines (actual/expected)"
-xaxis = "peptide nsp adjusted probability (sorted secondly by init prob)"
-yaxis = "fraction with cysteines (actual/expected)"
-cats = [['probs', 'fractions']]
-plot(base_toplot, base, title, xaxis, yaxis, hash, cats)
-=end
-## SPECIAL PROB VS FPR DIFF
-base = real_base.dup
-base << "." << "special_prob_vs_actual_expected_fraction"
-base_toplot = base + '.to_plot'
-title = "peptide prob (special) vs fraction with cysteines (actual/expected)"
-xaxis = "(3 * nsp_prob) + init_prob"
-yaxis = "fraction with cysteines (actual/expected)"
-cats = [['special_probs', 'fractions']]
-plot(base_toplot, base, title, xaxis, yaxis, hash, cats)

data/script/genuine_tps_and_probs.rb DELETED Viewed

@@ -1,136 +0,0 @@
-#!/usr/bin/ruby -w
-# Here is what I plotted: take each id'd pep-prot id'd on the tophit scans -- you will likely have the same pep-prot id'd on multiple scans -- plot the top probability of each such pep-prot.
-# There are 43 such id'd peptides for Sashimi, whereas SEQUEST id's about 66. So you'll have 66 (1-p-values) to plot, I had 43. Similarly for OMICS.
-require 'spec_id'
-require 'fasta'
-require 'optparse'
-require 'ostruct'
-# returns an accession number if available, or the entire reference (less the
-# starting '>'
-def get_fasta_accession(fasta_prot)
-  head = fasta_prot.header
-  if head =~ ACC_REGEX
-    $1.dup
-  else
-    head.sub(/^>/, '').rstrip
-  end
-end
-# returns the accession number from a reference, or the complete reference
-def accession_from_ref(pep)
-  ref = pep.prot.reference
-  if ref =~ ACC_REGEX
-    $1.dup
-  else
-    ref.rstrip
-  end
-end
-def get_pep_prot_accession(pep)
-  acc = pep.prot.accession
-  if !acc || acc == '0' || acc == 0
-    accession_from_ref(pep)
-  else
-    acc
-  end
-end
-#####################################################################
-# MAIN
-#####################################################################
-opt = OpenStruct.new
-opt.p = 'prob'
-opts = OptionParser.new do |op|
-  op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml true_hits.fasta"
-  op.separator "     [prints to stdout tab delimited table]"
-  op.on('-t', '--ties', 'allow ties on best hit') {|v| opt.t = v }
-  op.on('-p', '--param <s>', 'param: (xcorr | prob)') {|v| opt.p = v}
-end
-opts.parse!
-if ARGV.size < 2
-  puts opts
-  exit
-end
-case opt.p
-when 'prob'
-  param = :peptide_probability
-  best = :first
-when 'xcorr'
-  param = :xcorr
-  best = :last
-else
-  abort "incorrect param: #{opt.p}"
-end
-############################
-# GLOBALS
-DELIM = "\t"
-ACC_REGEX = /\|(.*?)\|/o
-############################
-bioworks = ARGV[0]
-fasta_file = ARGV[1]
-fprots = Fasta.new.read_file(fasta_file).prots
-gi_nums = fprots.map {|prot| get_fasta_accession(prot) }
-peptides = SpecID.new(bioworks).peps
-## Get the best peptide(s) per scan
-top_peps_per_scan = []
-peptides.hash_by(:base_name, :first_scan).each do |bn_scan, pep_array|
-  sorted_list = pep_array.sort_by {|pep| pep.send(param).to_f }
-  top_peps = if best == :first ; [sorted_list.shift] ; else [sorted_list.pop] end
-  found_another = false
-  sorted_list.each do |pep|
-    if pep.send(param).to_f == top_peps.send(best).send(param).to_f
-      if opt.t
-        top_peps << pep
-      else
-        found_another = true
-      end
-    end
-  end
-  unless found_another
-    top_peps_per_scan.push( *top_peps )
-  end
-end
-## Get the best scoring peptide per peptide/prot from list of best
-## peptides/scan
-top_pep_seq_prots = top_peps_per_scan.hash_by {|pep| [pep.sequence, get_pep_prot_accession(pep)] }.map do |k,pep_array|
-  pep_array.sort_by {|pep| pep.send(param).to_f }.send(best)
-end
-## sort the peptides by best score
-sorted_top_pep_seq_prots = top_pep_seq_prots.sort_by {|pep| pep.send(param).to_f }
-if best == :last ; sorted_top_pep_seq_prots.reverse! end
-## plot the probability vs. the number of tps
-puts ['#TPs', param, 'sequence', 'protein accession', 'xcorr'].join(DELIM)
-tps = 0
-sorted_top_pep_seq_prots.each do |pep|
-  if gi_nums.include?( get_pep_prot_accession(pep) )
-    tps += 1
-    puts [tps.to_s, pep.send(param), pep.sequence, get_pep_prot_accession(pep), pep.xcorr].join(DELIM)
-  end
-end

data/script/get_apex_values_rexml.rb DELETED Viewed

@@ -1,44 +0,0 @@
-#!/usr/bin/ruby
-require 'rexml/document'
-if ARGV.size == 0
-  puts "usage: #{File.basename(__FILE__)} <file>-prot.xml ..."
-  puts "outputs a .csv file"
-  exit
-end
-class Protein
-  attr_accessor :name, :pi, :ni
-  def initialize(name, pi, ni)
-    @name, @pi, @ni = name, pi, ni
-  end
-end
-class Listener
-  attr_accessor :proteins
-  def initialize
-    @proteins = []
-  end
-  def tag_start(name, attrs)
-    if name == "protein"
-      protein = Protein.new( attrs['protein_name'], attrs['probability'].to_f, attrs['total_number_peptides'].to_i)
-      @proteins.push( protein )
-    end
-  end
-  def method_missing(*args) ; end
-end
-ARGV.each do |file|
-  File.open("output.csv", 'w') do |out|
-    listener = Listener.new
-    REXML::Document.parse_stream(File.new(file), listener)
-    listener.proteins.sort_by {|prot| [prot.pi, prot.ni, prot.name] }.reverse.each do |protein|
-      out.puts [protein.name, protein.pi, protein.ni].join("\t")
-    end
-  end
-end

data/script/histogram_probs.rb DELETED Viewed

@@ -1,61 +0,0 @@
-#!/usr/bin/ruby
-require 'vec'
-require 'spec_id'
-require 'optparse'
-require 'ostruct'
-require 'set'
-opt = OpenStruct.new
-opt.p = ["INV_"]
-opt.b = 50
-opts = OptionParser.new do |opts|
-  opts.banner = "usage: #{File.basename(__FILE__)} [-d -b bins -p prefix[,...]] file ..."
-  opts.on_head "\noutputs 'histogram.toplot'\n(then) % plot.rb -w lp --yrange n1: --noenhanced histogram.toplot\n"
-  opts.on("-p", "--prefix PREFIX", "(comma sep list) FP protein header prefix (def: #{opt.p})") {|v| opt.p = v.split(',')}
-  opts.on("-b", "--bins NUM_BINS", "number of histogram bins (def: #{opt.b})") {|v| opt.b = v.to_i}
-  opts.on("-d", "--diff", "plots TP - FP") {|v| opt.b = v.to_i}
-end
-opts.parse!
-if ARGV.size < 1
-  puts opts
-end
-outfile = 'histogram.toplot'
-dtype = 'XYData'
-outfile_base = 'histogram'
-title = 'histogram of protein probabilities'
-xaxis = 'probability'
-yaxis = 'frequency'
-out = File.open(outfile, "w")
-[dtype, outfile_base, title, xaxis, yaxis].each do |it|
-  out.puts it
-end
-files = ARGV.to_a
-files.each_with_index do |file,i|
-  fp = VecD.new; tp = VecD.new
-  bio = SpecID.new(file)
-  re = /^#{opt.p[i]}/
-  bio.prots.each do |prot|
-    if prot.reference =~ re
-      fp << Math.log10(prot.probability)
-    else
-      tp << Math.log10(prot.probability)
-    end
-  end
-  if fp.size == 0 then puts "NO FALSE POSITIVES FOUND!  Your prefix is probably wrong ;)" end
-  label = file
-  t_bin, t_freq = tp.histogram(opt.b)
-  f_bin, f_freq = fp.histogram(opt.b)
-  out.puts 'TP ' + label
-  out.puts t_bin.to_s
-  out.puts t_freq.to_s
-  out.puts 'FP ' + label
-  out.puts f_bin.to_s
-  out.puts f_freq.to_s
-end
-out.close