mspire 0.4.9 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +27 -17
- data/changelog.txt +31 -62
- data/lib/ms/calc.rb +32 -0
- data/lib/ms/data/interleaved.rb +60 -0
- data/lib/ms/data/lazy_io.rb +73 -0
- data/lib/ms/data/lazy_string.rb +15 -0
- data/lib/ms/data/simple.rb +59 -0
- data/lib/ms/data/transposed.rb +41 -0
- data/lib/ms/data.rb +57 -0
- data/lib/ms/format/format_error.rb +12 -0
- data/lib/ms/spectrum.rb +25 -384
- data/lib/ms/support/binary_search.rb +126 -0
- data/lib/ms.rb +10 -10
- metadata +38 -350
- data/INSTALL +0 -58
- data/README.rdoc +0 -18
- data/Rakefile +0 -330
- data/bin/aafreqs.rb +0 -23
- data/bin/bioworks2excel.rb +0 -14
- data/bin/bioworks_to_pepxml.rb +0 -148
- data/bin/bioworks_to_pepxml_gui.rb +0 -225
- data/bin/fasta_shaker.rb +0 -5
- data/bin/filter_and_validate.rb +0 -5
- data/bin/gi2annot.rb +0 -14
- data/bin/id_class_anal.rb +0 -112
- data/bin/id_precision.rb +0 -172
- data/bin/ms_to_lmat.rb +0 -67
- data/bin/pepproph_filter.rb +0 -16
- data/bin/prob_validate.rb +0 -6
- data/bin/protein_summary.rb +0 -6
- data/bin/protxml2prots_peps.rb +0 -32
- data/bin/raw_to_mzXML.rb +0 -55
- data/bin/run_percolator.rb +0 -122
- data/bin/sqt_group.rb +0 -26
- data/bin/srf_group.rb +0 -27
- data/bin/srf_to_sqt.rb +0 -40
- data/lib/align/chams.rb +0 -78
- data/lib/align.rb +0 -154
- data/lib/archive/targz.rb +0 -94
- data/lib/bsearch.rb +0 -120
- data/lib/core_extensions.rb +0 -16
- data/lib/fasta.rb +0 -626
- data/lib/gi.rb +0 -124
- data/lib/group_by.rb +0 -10
- data/lib/index_by.rb +0 -11
- data/lib/merge_deep.rb +0 -21
- data/lib/ms/converter/mzxml.rb +0 -77
- data/lib/ms/gradient_program.rb +0 -170
- data/lib/ms/msrun.rb +0 -244
- data/lib/ms/msrun_index.rb +0 -108
- data/lib/ms/parser/mzdata/axml.rb +0 -67
- data/lib/ms/parser/mzdata/dom.rb +0 -175
- data/lib/ms/parser/mzdata/libxml.rb +0 -7
- data/lib/ms/parser/mzdata.rb +0 -31
- data/lib/ms/parser/mzxml/axml.rb +0 -70
- data/lib/ms/parser/mzxml/dom.rb +0 -182
- data/lib/ms/parser/mzxml/hpricot.rb +0 -253
- data/lib/ms/parser/mzxml/libxml.rb +0 -19
- data/lib/ms/parser/mzxml/regexp.rb +0 -122
- data/lib/ms/parser/mzxml/rexml.rb +0 -72
- data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
- data/lib/ms/parser/mzxml.rb +0 -282
- data/lib/ms/parser.rb +0 -108
- data/lib/ms/precursor.rb +0 -25
- data/lib/ms/scan.rb +0 -81
- data/lib/mspire.rb +0 -4
- data/lib/pi_zero.rb +0 -244
- data/lib/qvalue.rb +0 -161
- data/lib/roc.rb +0 -187
- data/lib/sample_enzyme.rb +0 -160
- data/lib/scan_i.rb +0 -21
- data/lib/spec_id/aa_freqs.rb +0 -170
- data/lib/spec_id/bioworks.rb +0 -497
- data/lib/spec_id/digestor.rb +0 -138
- data/lib/spec_id/mass.rb +0 -179
- data/lib/spec_id/parser/proph.rb +0 -335
- data/lib/spec_id/precision/filter/cmdline.rb +0 -218
- data/lib/spec_id/precision/filter/interactive.rb +0 -134
- data/lib/spec_id/precision/filter/output.rb +0 -148
- data/lib/spec_id/precision/filter.rb +0 -637
- data/lib/spec_id/precision/output.rb +0 -60
- data/lib/spec_id/precision/prob/cmdline.rb +0 -160
- data/lib/spec_id/precision/prob/output.rb +0 -94
- data/lib/spec_id/precision/prob.rb +0 -249
- data/lib/spec_id/proph/pep_summary.rb +0 -104
- data/lib/spec_id/proph/prot_summary.rb +0 -484
- data/lib/spec_id/proph.rb +0 -4
- data/lib/spec_id/protein_summary.rb +0 -489
- data/lib/spec_id/sequest/params.rb +0 -316
- data/lib/spec_id/sequest/pepxml.rb +0 -1458
- data/lib/spec_id/sequest.rb +0 -33
- data/lib/spec_id/sqt.rb +0 -349
- data/lib/spec_id/srf.rb +0 -973
- data/lib/spec_id.rb +0 -778
- data/lib/spec_id_xml.rb +0 -99
- data/lib/transmem/phobius.rb +0 -147
- data/lib/transmem/toppred.rb +0 -368
- data/lib/transmem.rb +0 -157
- data/lib/validator/aa.rb +0 -48
- data/lib/validator/aa_est.rb +0 -112
- data/lib/validator/background.rb +0 -77
- data/lib/validator/bias.rb +0 -95
- data/lib/validator/cmdline.rb +0 -431
- data/lib/validator/decoy.rb +0 -107
- data/lib/validator/digestion_based.rb +0 -70
- data/lib/validator/probability.rb +0 -51
- data/lib/validator/prot_from_pep.rb +0 -234
- data/lib/validator/q_value.rb +0 -32
- data/lib/validator/transmem.rb +0 -272
- data/lib/validator/true_pos.rb +0 -46
- data/lib/validator.rb +0 -197
- data/lib/xml.rb +0 -38
- data/lib/xml_style_parser.rb +0 -119
- data/lib/xmlparser_wrapper.rb +0 -19
- data/release_notes.txt +0 -2
- data/script/compile_and_plot_smriti_final.rb +0 -97
- data/script/create_little_pepxml.rb +0 -61
- data/script/degenerate_peptides.rb +0 -47
- data/script/estimate_fpr_by_cysteine.rb +0 -226
- data/script/extract_gradient_programs.rb +0 -56
- data/script/find_cysteine_background.rb +0 -137
- data/script/genuine_tps_and_probs.rb +0 -136
- data/script/get_apex_values_rexml.rb +0 -44
- data/script/histogram_probs.rb +0 -61
- data/script/mascot_fix_pepxml.rb +0 -123
- data/script/msvis.rb +0 -42
- data/script/mzXML2timeIndex.rb +0 -25
- data/script/peps_per_bin.rb +0 -67
- data/script/prep_dir.rb +0 -121
- data/script/simple_protein_digestion.rb +0 -27
- data/script/smriti_final_analysis.rb +0 -103
- data/script/sqt_to_meta.rb +0 -24
- data/script/top_hit_per_scan.rb +0 -67
- data/script/toppred_to_yaml.rb +0 -47
- data/script/tpp_installer.rb +0 -249
- data/specs/align_spec.rb +0 -79
- data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
- data/specs/bin/fasta_shaker_spec.rb +0 -259
- data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
- data/specs/bin/filter_and_validate_spec.rb +0 -180
- data/specs/bin/ms_to_lmat_spec.rb +0 -34
- data/specs/bin/prob_validate_spec.rb +0 -86
- data/specs/bin/protein_summary_spec.rb +0 -14
- data/specs/fasta_spec.rb +0 -354
- data/specs/gi_spec.rb +0 -22
- data/specs/load_bin_path.rb +0 -7
- data/specs/merge_deep_spec.rb +0 -13
- data/specs/ms/gradient_program_spec.rb +0 -77
- data/specs/ms/msrun_spec.rb +0 -498
- data/specs/ms/parser_spec.rb +0 -92
- data/specs/ms/spectrum_spec.rb +0 -87
- data/specs/pi_zero_spec.rb +0 -115
- data/specs/qvalue_spec.rb +0 -39
- data/specs/roc_spec.rb +0 -251
- data/specs/rspec_autotest.rb +0 -149
- data/specs/sample_enzyme_spec.rb +0 -126
- data/specs/spec_helper.rb +0 -135
- data/specs/spec_id/aa_freqs_spec.rb +0 -52
- data/specs/spec_id/bioworks_spec.rb +0 -148
- data/specs/spec_id/digestor_spec.rb +0 -75
- data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
- data/specs/spec_id/precision/filter/output_spec.rb +0 -31
- data/specs/spec_id/precision/filter_spec.rb +0 -246
- data/specs/spec_id/precision/prob_spec.rb +0 -44
- data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
- data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
- data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
- data/specs/spec_id/protein_summary_spec.rb +0 -189
- data/specs/spec_id/sequest/params_spec.rb +0 -68
- data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
- data/specs/spec_id/sequest_spec.rb +0 -38
- data/specs/spec_id/sqt_spec.rb +0 -246
- data/specs/spec_id/srf_spec.rb +0 -172
- data/specs/spec_id/srf_spec_helper.rb +0 -139
- data/specs/spec_id_helper.rb +0 -33
- data/specs/spec_id_spec.rb +0 -366
- data/specs/spec_id_xml_spec.rb +0 -33
- data/specs/transmem/phobius_spec.rb +0 -425
- data/specs/transmem/toppred_spec.rb +0 -298
- data/specs/transmem_spec.rb +0 -60
- data/specs/transmem_spec_shared.rb +0 -64
- data/specs/validator/aa_est_spec.rb +0 -66
- data/specs/validator/aa_spec.rb +0 -40
- data/specs/validator/background_spec.rb +0 -67
- data/specs/validator/bias_spec.rb +0 -122
- data/specs/validator/decoy_spec.rb +0 -51
- data/specs/validator/fasta_helper.rb +0 -26
- data/specs/validator/prot_from_pep_spec.rb +0 -141
- data/specs/validator/transmem_spec.rb +0 -146
- data/specs/validator/true_pos_spec.rb +0 -58
- data/specs/validator_helper.rb +0 -33
- data/specs/xml_spec.rb +0 -12
- data/test_files/000_pepxml18_small.xml +0 -206
- data/test_files/020a.mzXML.timeIndex +0 -4710
- data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
- data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
- data/test_files/4-03-03_small-prot.xml +0 -321
- data/test_files/4-03-03_small.xml +0 -3876
- data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
- data/test_files/bioworks-3.3_10prots.xml +0 -5999
- data/test_files/bioworks31.params +0 -77
- data/test_files/bioworks32.params +0 -62
- data/test_files/bioworks33.params +0 -63
- data/test_files/bioworks_single_run_small.xml +0 -7237
- data/test_files/bioworks_small.fasta +0 -212
- data/test_files/bioworks_small.params +0 -63
- data/test_files/bioworks_small.phobius +0 -109
- data/test_files/bioworks_small.toppred.out +0 -2847
- data/test_files/bioworks_small.xml +0 -5610
- data/test_files/bioworks_with_INV_small.xml +0 -3753
- data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
- data/test_files/corrupted_900.srf +0 -0
- data/test_files/head_of_7MIX.srf +0 -0
- data/test_files/interact-opd1_mods_small-prot.xml +0 -304
- data/test_files/messups.fasta +0 -297
- data/test_files/opd1/000.my_answer.100lines.xml +0 -101
- data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
- data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
- data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
- data/test_files/opd1/000_020-prot.png +0 -0
- data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
- data/test_files/opd1/000_020_3prots-prot.xml +0 -62
- data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
- data/test_files/opd1/sequest.3.1.params +0 -77
- data/test_files/opd1/sequest.3.2.params +0 -62
- data/test_files/opd1/twenty_scans.mzXML +0 -418
- data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
- data/test_files/opd1/twenty_scans_answ.lmat +0 -0
- data/test_files/opd1/twenty_scans_answ.lmata +0 -9
- data/test_files/opd1_020_beginning.RAW +0 -0
- data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
- data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
- data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
- data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
- data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
- data/test_files/pepproph_small.xml +0 -4691
- data/test_files/phobius.small.noheader.txt +0 -50
- data/test_files/phobius.small.small.txt +0 -53
- data/test_files/s01_anC1_ld020mM.key.txt +0 -25
- data/test_files/s01_anC1_ld020mM.meth +0 -0
- data/test_files/small.fasta +0 -297
- data/test_files/small.sqt +0 -87
- data/test_files/smallraw.RAW +0 -0
- data/test_files/tf_bioworks2excel.bioXML +0 -14340
- data/test_files/tf_bioworks2excel.txt.actual +0 -1035
- data/test_files/toppred.small.out +0 -416
- data/test_files/toppred.xml.out +0 -318
- data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
- data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
- data/test_files/yeast_gly_small-prot.xml +0 -265
- data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
- data/test_files/yeast_gly_small.xml +0 -3807
- data/test_files/yeast_gly_small2.parentTimes +0 -6
data/script/mascot_fix_pepxml.rb
DELETED
@@ -1,123 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'ms/msrun'
|
5
|
-
gem 'axml', '= 0.0.2'
|
6
|
-
|
7
|
-
# Parses a spectrum title such as
#   "..., Elution: 12.34 to 12.56 min, ..., Cycle(s): 663, 675 (Experiment 2), 667 (Experiment 4)"
# and returns an array of one or two [cycle_num, experiment_num, seconds]
# triplets: the lowest [cycle, experiment] pair coupled with the first elution
# time and, when it differs, the highest pair coupled with the last elution
# time.  Times are converted from decimal minutes to seconds.
#
# The "Cycle(s):" field must be the last comma-separated field of the title.
# Raises ArgumentError when the "Cycle(s):" or "Elution:" field is missing.
def get_cycle_exp_time_triplets(string)
  pieces = string.split(', ')
  cycle_index = pieces.index { |piece| piece =~ /^Cycle\(s\):/ }
  raise ArgumentError, "no 'Cycle(s):' field in #{string.inspect}" unless cycle_index

  # The cycle list itself contains ", ", so glue the trailing pieces back together:
  # Cycle(s): 663, 675 (Experiment 2), 667 (Experiment 4)
  _header, info = pieces[cycle_index..-1].join(', ').split(': ')
  cycle_exp_pairs = []
  info.split('), ').each do |group|
    nums, exp_label = group.split('(')
    exp_num = exp_label.split(' ').last.sub(/\)$/, '').to_i
    nums.split(', ').each { |v| cycle_exp_pairs << [v.to_i, exp_num] }
  end

  min = cycle_exp_pairs.min
  max = cycle_exp_pairs.max

  elution = pieces.find { |piece| piece =~ /^Elution:/ }
  raise ArgumentError, "no 'Elution:' field in #{string.inspect}" unless elution

  times = elution.split(': ').last.sub(/ min$/, '').split(' to ').map do |v|
    minutes, minute_decimals = v.split('.')
    fraction = minute_decimals.to_s
    # Scale by the number of fractional digits actually present ("5" is
    # 5/10 of a minute, "55" is 55/100) instead of always assuming two.
    (minutes.to_f * 60) + (fraction.to_f * 60 / (10**fraction.size))
  end

  if max == min
    [[min.first, min.last, times.first]]
  else
    [[min.first, min.last, times.first], [max.first, max.last, times.last]]
  end
end
|
51
|
-
|
52
|
-
# Returns the scan number of the last entry in +time_to_scan_num+ whose time
# is strictly below +cycle_time+, looking only at the leading run of entries
# that satisfy that condition (iteration stops at the first entry that does
# not).  Returns nil when the very first entry is already at or past
# +cycle_time+, or when the collection is empty.
#
# +cycle+ is accepted but not used.
# +time_to_scan_num+ yields [scan_time, scan_num] pairs.
def get_scan_num(cycle, cycle_time, time_to_scan_num)
  # linear walk from the front: simple rather than fast
  earlier = time_to_scan_num.take_while { |scan_time, _scan_num| scan_time < cycle_time }
  preceding = earlier.last
  preceding ? preceding[1] : nil
end
|
64
|
-
|
65
|
-
#####################################################
# MAIN:
# Reads a pepXML and an mzXML file, sets 'base_name'/'raw_data' on the first
# child of the pepXML root, fills in start_scan/end_scan on each
# spectrum_query from the cycle/experiment/time information in its
# 'spectrum' attribute, and writes the result next to the input file.
#####################################################

additional_ext = ".with_scan_nums"

if ARGV.size != 2
  puts "usage: #{File.basename(__FILE__)} <file>.pepXML <file>.mzXML"
  puts ""
  puts "uses information from the mzXML file to fix the pepXML file"
  puts "(adds in msms_run_summary: 'base_name' and 'raw_data' attributes;"
  puts " adds scan numbers based on cycle and experiment times)"
  puts ""
  puts "outputs: <file>#{additional_ext}.pepXML"
  exit
end

# get time_to_scan_num for msLevel=1 from the mzXML file
(pepxml, mzxml) = ARGV
mzxml_basename = File.basename(mzxml).sub(/\.mzxml$/i, '')

ext = File.extname(pepxml)
# Insert the marker in front of the *trailing* extension only.  (Substituting
# the first occurrence of the extension text would rewrite a directory name
# such as "run.xml.d/" or, with no extension at all, prepend the marker.)
output = pepxml[0, pepxml.size - ext.size] + additional_ext + ext

ms = MS::MSRun.new(mzxml, :lazy => :no_spectra)
time_to_scan_num = ms.scans.select {|scan| scan.ms_level == 1 }.map do |scan|
  [scan.time, scan.num]
end

# update spectrum queries based on scan number

root = AXML.parse_file(pepxml)
# fix the basename stuff:
msms_r_summary_n = root.child
atts = msms_r_summary_n.attrs
atts['base_name'] = mzxml_basename
atts['raw_data'] = '.mzXML'

root.child.find("child::spectrum_query").each do |sq|
  triplets = get_cycle_exp_time_triplets(sq['spectrum'])
  triplets.map! do |triplet|
    [get_scan_num(triplet[0], triplet[2], time_to_scan_num), *triplet]
  end
  # each entry is now [scan_num, cycle, exp, time]
  quad = triplets.first
  # scan number = preceding MS1 scan + experiment number - 1
  first_scan_num = (quad[0] + quad[2] - 1)
  sq.attrs['start_scan'] = first_scan_num.to_s
  sq.attrs['end_scan'] =
    if triplets.size > 1
      quad = triplets.last
      (quad[0] + quad[2] - 1).to_s
    else
      first_scan_num.to_s
    end
end

xml_header = '<?xml version="1.0" encoding="UTF-8"?>'
File.open(output, 'w') {|out| out.puts(xml_header); out.print root.to_s }
|
123
|
-
|
data/script/msvis.rb
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby
|
2
|
-
|
3
|
-
|
4
|
-
# Writes a "local.cfg" options file listing the given .msmat files.  An
# existing local.cfg is first renamed to "local.cfg.backup".
options_file = "local.cfg"
filetype = "msmat"
base = "Msvis_filename"
files = ARGV.to_a

# Check usage BEFORE touching the filesystem, so that running the script with
# no arguments does not move an existing local.cfg out of the way and exit.
if files.size == 0
  puts "msvis.rb file.msmat ..."
  puts "right now only creates a local.cfg file"
  exit
end

# These two are only consumed by the (currently disabled) restore step below.
moving_options_file = false
mv_options_file = ""
if File.exist?(options_file)
  mv_options_file = options_file + ".backup"
  File.rename(options_file, mv_options_file)
  moving_options_file = true
end

File.open(options_file, "w") do |fh|
  fh.print "Msvis_filetype = " + filetype + "\n"
  fh.print "Msvis_num = " + files.size.to_s + "\n"
  files.each_with_index do |file, cnt|
    fh.print( base + cnt.to_s + " = " + "\"#{file}\"" + "\n" )
  end
end

#exec "./msvis"

#File.unlink options_file

#if moving_options_file
# File.rename mv_options_file, options_file
#end
|
42
|
-
|
data/script/mzXML2timeIndex.rb
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
require 'spec/mzxml/parser'
|
4
|
-
require 'spec/msrun'
|
5
|
-
require 'rexml/document'
|
6
|
-
include REXML
|
7
|
-
|
8
|
-
# With no arguments, show the usage text.  Execution is not stopped here; the
# loop below simply has nothing to iterate over.
if ARGV.empty?
  puts "usage: #{File.basename(__FILE__)} file.mzXML ...",
       " outputs 'file.mzXML.timeIndex'",
       " which contains rows of:",
       " level scan_num time (if !msLevel1:) prec_mz prec_intensity"
end

# For every input file, build an MS::MSRunIndex and write it to
# "<input>.timeIndex".  Per the usage text, rows are:
# level scan_num time [precursor_mz precursor_intensity(if !msLevel1)]
ARGV.each do |path|
  puts "READING: #{path}"
  index_path = "#{path}.timeIndex"
  run_index = MS::MSRunIndex.new(path)
  puts "WRITING: #{index_path}"
  run_index.to_index_file(index_path)
end
|
25
|
-
|
data/script/peps_per_bin.rb
DELETED
@@ -1,67 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
require 'generator'
|
4
|
-
require 'optparse'
|
5
|
-
|
6
|
-
require 'fasta'
|
7
|
-
require 'sample_enzyme'
|
8
|
-
require 'spec_id/digestor'
|
9
|
-
require 'spec_id/mass'
|
10
|
-
require 'vec'
|
11
|
-
|
12
|
-
# Defaults for the command-line options below.
opt = {}
opt[:missed_cleavages] = 0
opt[:bin_size] = 0.001 # ~ parts per million
opt[:min] = 300.0
opt[:max] = 4500.0
opt[:h_plus] = 1.0

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} *.fasta"
  op.separator "Outputs a close estimate of number of peptides per bin."
  op.separator "Uses m+H+ as the peptide mass."
  op.separator "[for speed, assumes that there is a peptide mass close to the extremes]"
  op.on("-b", "--bin_size <F>", Float, "size of bins [#{opt[:bin_size]}]") {|v| opt[:bin_size] = v }
  op.on("-x", "--max <F>", Float, "max mass to accept [#{opt[:max]}]") {|v| opt[:max] = v }
  op.on("-n", "--min <F>", Float, "min mass to accept [#{opt[:min]}]") {|v| opt[:min] = v }
  op.on("-h", "--h_plus <F>", Float, "value of H+ to use [#{opt[:h_plus]}]") {|v| opt[:h_plus] = v }
  op.on("-m", "--missed_cleavages <N>", Integer, "num missed cleavages [#{opt[:missed_cleavages]}]") {|v| opt[:missed_cleavages] = v }
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
  exit
end

min_mass = opt[:min]
max_mass = opt[:max]

# For each fasta file: digest every protein, compute the masses of the unique
# peptides, keep those inside [min, max], histogram them and report the
# average count per bin.
ARGV.each do |file|
  fasta = Fasta.new(file)
  uniq_aaseqs = fasta.map do |prot|
    SampleEnzyme.tryptic(prot.aaseq, opt[:missed_cleavages])
  end.flatten.uniq

  # Compute the mass list once and filter it (it used to be computed twice,
  # with the first result thrown away).
  masses = Mass::Calculator.new(Mass::MONO, opt[:h_plus]).masses(uniq_aaseqs)
  passing_masses = masses.select do |mh|
    (mh >= min_mass) && (mh <= max_mass)
  end

  # Nothing in range: min/max would be nil and the arithmetic below would raise.
  if passing_masses.empty?
    warn "#{file}: no peptide masses between #{min_mass} and #{max_mass}"
    next
  end

  ## warn if the masses aren't close to the end points
  if (max_mass - passing_masses.max) > 1.0
    warn "highest mass is not that close to max: #{passing_masses.max}"
  end
  if (passing_masses.min - min_mass) > 1.0
    warn "lowest mass is not that close to min: #{passing_masses.min}"
  end

  num_bins = (max_mass - min_mass) / opt[:bin_size]

  (bins, freqs) = VecD.new(passing_masses).histogram(num_bins)

  # report
  puts "#{file}: #{freqs.avg}"
end
|
data/script/prep_dir.rb
DELETED
@@ -1,121 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
|
4
|
-
## Helper methods (mixed into Kernel) for preparing zipped RAW / sequest
## archives.  Assumes the same base
module Kernel

  # filename patterns of files deleted by remove_extra_raw
  @@remove_raw = [/flush/, /equil/, /To_sequest/, /to_sequest/, /TempSequence/]
  @@seqext = '.sequest.zip'
  @@rawext = ['.RAW.zip', '.raw.zip']

  ## gets the basename of a file like this filename.RAW.zip or filename.raw.zip
  ## (prints a message and exits with status 1 if neither suffix matches)
  def get_basename(zip_file)
    try1 = File.basename(zip_file, @@rawext[0])
    try2 = File.basename(zip_file, @@rawext[1])

    # whichever suffix actually matched yields the shorter name
    if try1.size < try2.size
      try1
    elsif try1.size > try2.size
      try2
    else #they are equal
      puts "something wrong at the basename"
      exit(1)
    end
  end

  ## deletes every entry of the current directory whose name matches one of
  ## the @@remove_raw patterns
  def remove_extra_raw
    Dir.foreach(Dir.getwd) do |entry|
      # any? so that a name matching several patterns is unlinked only once
      if @@remove_raw.any? { |pattern| entry =~ pattern }
        puts "removing " + entry
        File.unlink entry
      end
    end
  end

  ## runs the external raw2mzXML.pl converter on all *.RAW files in the
  ## current directory
  def raw2mzXML
    system "raw2mzXML.pl *.RAW"
  end

  ## extracts "<basename>/sequest.params" from the given .sequest.zip archive
  ## (via the external unzip program) and returns that relative path.
  ## Prints a message and exits if the archive does not exist.
  def get_sequest_params(seqfile)
    unless File.exist?(seqfile)
      puts "couldn't find #{seqfile}"
      exit
    end
    basename = get_seq_basename(seqfile)
    extracted = basename + "/sequest.params"
    # argument-list form: no shell, so file names with spaces are safe
    system("unzip", seqfile, extracted)
    return extracted
  end

  ## basename of a file like filename.sequest.zip
  def get_seq_basename(file)
    File.basename(file, @@seqext)
  end

end
|
58
|
-
|
59
|
-
if ARGV.size < 1
  puts "usage: #{File.basename(__FILE__)} file.raw.zip"
  puts "This is specific to Peng's data to prepare it for OPD"
  exit
end

# Sort the arguments into raw archives and sequest archives.
rawfiles = []
seqfiles = []
ARGV.each do |try|
  # case-insensitive: get_basename accepts both ".RAW.zip" and ".raw.zip"
  if try =~ /\.raw\.zip/i
    rawfiles.push(try)
  elsif try =~ /\.sequest\.zip/
    seqfiles.push(try)
  else
    puts "skipping " + try
  end
end

## depends on them being alphabetical: the n-th raw archive is paired with
## the n-th sequest archive
rawfiles.each_with_index do |rawfile, cnt|
  seqfile = seqfiles[cnt]
  raw_basename = get_basename(rawfile)
  system("unzip", rawfile)
  puts "Basename: " + raw_basename
  current_dir = Dir.getwd
  # Dir.chdir raises on failure rather than returning false
  begin
    Dir.chdir(raw_basename)
  rescue SystemCallError
    puts "can't change to #{raw_basename}"
    exit
  end
  remove_extra_raw
  raw2mzXML
  system("mkdir raw")
  system("mkdir mzxml")
  system('mv *.RAW raw/')
  system('mv *.mzXML mzxml/')
  Dir.chdir(current_dir)
  rawzip = raw_basename + '.raw.zip'
  mzxmlzip = raw_basename + '.mzxml.zip'
  system("zip -r #{rawzip} #{raw_basename}/raw/*")
  system("zip -r #{mzxmlzip} #{raw_basename}/mzxml/*")
  system("mv", rawzip, raw_basename)
  system("mv", mzxmlzip, raw_basename)
  Dir.chdir(raw_basename)
  # only drop the unpacked directories once both archives are in place
  if (Dir.glob("*.zip").size == 2)
    system("rm -rf raw")
    system("rm -rf mzxml")
  end
  Dir.chdir current_dir

  # no sequest archive was supplied for this raw archive
  unless seqfile
    puts "no sequest archive given for #{rawfile}"
    next
  end

  ## get the sequest.params file:
  extracted = get_sequest_params(seqfile)
  system("mv", extracted, raw_basename)

  ## move the sequest file in
  system("chmod", "664", seqfile)
  system("mv", seqfile, raw_basename)
end
|
120
|
-
|
121
|
-
|
@@ -1,27 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
require 'fasta'
|
4
|
-
require 'sample_enzyme'
|
5
|
-
|
6
|
-
if ARGV.size < 2
  puts "usage: #{File.basename(__FILE__)} missed_cleavages <file>.fasta ..."
  puts " returns <file>.missed_cleavages_<missed_cleavages>.peptides"
  abort
end

missed_cleavages = ARGV.shift.to_i

# For each fasta file write one line per protein:
#   <first word of header without '>'> TAB <space separated tryptic peptides>
ARGV.each do |file|

  # anchored, so the check agrees with the suffix stripping just below
  if file !~ /\.fasta$/
    abort "must be a fasta file with extension fasta"
  end
  new_filename = file.sub(/\.fasta$/, '')
  new_filename << ".missed_cleavages_#{missed_cleavages}.peptides"
  File.open(new_filename, "w") do |fh|
    Fasta.new.read_file(file).prots.each do |prot|
      fh.puts( prot.header.split(/\s+/).first.sub(/^>/,'') + "\t" + SampleEnzyme.tryptic(prot.aaseq, missed_cleavages).join(" ") )
    end
  end
end
|
@@ -1,103 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
require 'spec_id'
|
4
|
-
require 'fasta'
|
5
|
-
require 'optparse'
|
6
|
-
|
7
|
-
# --top flag: when set, only the best peptide(s) per scan are considered
$top = false
opts = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} bioworks.xml <file>.fasta|prefix"
  parser.separator "outputs stdout (tab del sorted by probability) probability, file:aaseq:charge T/F"
  parser.separator "hashes on file+aaseq+charge"
  parser.on("-t", "--top", "only top peptide (by prob) per scan+charge") { $top = true }
end

opts.parse!

# two positional arguments are required
unless ARGV.size >= 2
  puts opts.to_s
  exit
end

specid_file, file_or_prefix = ARGV.shift(2)

specid = SpecID.new(specid_file)

# The second argument is either an existing fasta file (loaded as a Fasta
# object) or a plain prefix string used as-is.
indicator = File.exist?(file_or_prefix) ? Fasta.new.read_file(file_or_prefix) : file_or_prefix
|
35
|
-
|
36
|
-
|
37
|
-
# Returns an array containing the min prob peptides (more than one in case of
# a tie).  Probabilities are compared after #to_f.  Returns an empty array
# for empty input.
def lowest_peps(ar)
  return [] if ar.empty?
  min_prob = ar.map { |pep| pep.probability.to_f }.min
  ar.select { |pep| pep.probability.to_f == min_prob }
end
|
42
|
-
|
43
|
-
# Start from every peptide; with --top keep only the lowest-probability
# peptide(s) of each base_name + first_scan group.
peps = specid.peps
if $top
  top_by_scan = []
  peps.hash_by(:base_name, :first_scan).each do |_key, group|
    top_by_scan.push( *lowest_peps(group) )
  end
  peps = top_by_scan
end

# One result row per base_name + aaseq + charge: [probability, key, is_true]
results = peps.hash_by(:base_name, :aaseq, :charge).map do |key, group|
  low_peps = lowest_peps(group)
  # protein references of all the tied lowest-probability peptides
  all_prot_references = []
  low_peps.each do |pep|
    all_prot_references.push( *(pep.prots.map {|prot| prot.reference }) )
  end
  all_prot_references.uniq!
  is_true =
    if indicator.is_a? Fasta
      # fasta given: true when any reference is found in the fasta headers
      all_prot_references.any? do |ref|
        indicator.included_in_header?(ref)
      end
    else
      # prefix given: true unless every reference contains the prefix
      !(all_prot_references.all? {|ref| ref.include?( indicator )})
    end
  # every peptide in low_peps shares the same (minimum) probability
  [low_peps.first.probability.to_f, key, is_true]
end

results.sort.each do |result|
  report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
  puts report.join("\t")
end
|
75
|
-
|
76
|
-
=begin
|
77
|
-
# ORIGINAL CODE
|
78
|
-
peps = specid.peps
|
79
|
-
if $top
|
80
|
-
peps = peps.hash_by(:base_name, :first_scan).map do |k,v|
|
81
|
-
v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
results = peps.hash_by(:base_name, :aaseq, :charge).map do |k,v|
|
86
|
-
min_pep = v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
|
87
|
-
references = min_pep.prots.map {|v| v.reference }.uniq
|
88
|
-
is_true =
|
89
|
-
if indicator.is_a? Fasta
|
90
|
-
references.any? do |ref|
|
91
|
-
indicator.included_in_header?(ref)
|
92
|
-
end
|
93
|
-
else
|
94
|
-
!(references.all? {|ref| ref.include?( indicator )})
|
95
|
-
end
|
96
|
-
[min_pep.probability.to_f, k, is_true]
|
97
|
-
end
|
98
|
-
|
99
|
-
results.sort.each do |result|
|
100
|
-
report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
|
101
|
-
puts report.join("\t")
|
102
|
-
end
|
103
|
-
=end
|
data/script/sqt_to_meta.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -s
|
2
|
-
|
3
|
-
require 'optparse'
|
4
|
-
|
5
|
-
# default output path; overridden by -o/--outfile
$outfile = 'meta.sqm'
opts = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} <file>.sqt ..."
  parser.separator "outputs meta.sqm (a sqt meta file)"
  parser.on("-o", "--outfile <file>", "currently: #{$outfile}") { |path| $outfile = path }
end

opts.parse!

# at least one .sqt file is required
if ARGV.empty?
  puts opts.to_s
  exit
end

# the meta file is simply the absolute path of each input, one per line
File.open($outfile, 'w') do |out|
  ARGV.each { |sqt_file| out.puts File.expand_path(sqt_file) }
end
|
24
|
-
|
data/script/top_hit_per_scan.rb
DELETED
@@ -1,67 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
###################################################################
|
4
|
-
cats = %w(base_name sequence xcorr deltacn first_scan last_scan)
|
5
|
-
###################################################################
|
6
|
-
|
7
|
-
require 'spec_id'
|
8
|
-
require 'hash_by'
|
9
|
-
|
10
|
-
# suffixes that replace ".xml" in the two output file names
extension_top = '.top_per_scan.txt'
extension_all = '.all_peps_per_scan.txt'

if ARGV.size < 1
  puts "usage: #{File.basename(__FILE__)} <file>.xml"
  # list both outputs (the old text interpolated an undefined variable and
  # raised NameError instead of printing the usage)
  puts "output: <file>#{extension_top}, <file>#{extension_all}"
  puts ""
  puts "Generates top hit (highest xcorr) per scan."
  exit
end
|
20
|
-
|
21
|
-
# Writes a tab-delimited table to +outfile+: one header line (+headers+
# joined by tabs) followed by one line per row of +table_a_of_a+.  Rows are
# separated by newlines; no newline is written after the last row.
def print_doc(outfile, headers, table_a_of_a)
  rows = table_a_of_a.map { |row| row.join("\t") }
  body = rows.join("\n")
  File.open(outfile, 'w') do |out|
    out.print "#{headers.join("\t")}\n"
    out.print body
  end
end
|
30
|
-
|
31
|
-
|
32
|
-
# Builds one row per peptide: the peptide's protein reference
# (pep.prot.reference) followed by the result of sending each symbol in
# +send_to+ to the peptide.  Returns the array of rows.
def pep_array_to_table(peps, send_to)
  peps.map do |pep|
    fields = send_to.map { |attribute| pep.send(attribute) }
    # the protein reference goes in the first column
    [pep.prot.reference] + fields
  end
end
|
38
|
-
|
39
|
-
###############################################
|
40
|
-
# MAIN:
|
41
|
-
###############################################
|
42
|
-
|
43
|
-
file = ARGV[0]
# Strip a trailing ".xml" (if any) and always append the output suffix, so an
# input without the ".xml" extension can never be overwritten by its own output.
stem = file.sub(/\.xml$/, '')
outfile_top = stem + extension_top
outfile_all = stem + extension_all

sp = SpecID.new(file)

# The old (incorrect version)
# pep_hash = sp.peps.hash_by(:first_scan, :last_scan)
# The correct version:
pep_hash = sp.peps.hash_by(:base_name, :first_scan, :last_scan)
# highest xcorr within each base_name/first_scan/last_scan group
top_per_scan = pep_hash.map {|k,v| v.sort_by {|ob| ob.xcorr.to_f }.last }
top_per_scan = top_per_scan.sort_by {|pep| pep.first_scan.to_i }

# all peptides, by scan and then by descending xcorr
all_peps = sp.peps.sort_by {|pep| [pep.first_scan.to_i, -1.0 * pep.xcorr.to_f] }

cats_sym = cats.map {|v| v.to_sym }

a_of_a_top = pep_array_to_table(top_per_scan, cats_sym)
a_of_a_all = pep_array_to_table(all_peps, cats_sym)

# pep_array_to_table puts the protein reference in the first column
cats.unshift "protein_reference"

print_doc(outfile_top, cats, a_of_a_top)
print_doc(outfile_all, cats, a_of_a_all)
|
67
|
-
|
data/script/toppred_to_yaml.rb
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
#!/usr/bin/ruby -w
|
2
|
-
|
3
|
-
|
4
|
-
require 'optparse'
|
5
|
-
|
6
|
-
opt = {}
opt[:probability] = 1.0
opts = OptionParser.new do |op|
  op.banner = "USAGE: #{File.basename(__FILE__)} toppred.out"
  op.separator "Outputs toppred.yaml"
  op.separator "takes the highest probability structure"
  op.separator "for best structures of equal probability, takes first given"
  op.separator "Each line contains:"
  op.separator "<identifier>: String :"
  op.separator " num_found: Int"
  op.separator " num_certain_transmembrane_segments: Int"
  op.separator " num_putative_transmembrane_segments: Int"
  op.separator " best_structure_probability: Float"
  op.separator " transmembrane_segments:"
  op.separator " - probability: Float"
  op.separator " start: Int"
  op.separator " stop: Int"
  op.separator " aaseq: String"
  op.separator ""
  op.separator "OPTIONS:"
  # value placeholder declared explicitly so the switch takes a Float argument
  # NOTE(review): opt[:probability] is parsed but not used below - confirm intent
  op.on("-p", "--probability <F>", Float, "min structure prob threshold (default #{opt[:probability]})") {|v| opt[:probability] = v}
end

opts.parse!


if ARGV.size == 0
  puts opts
  exit
end

file = ARGV.shift

# needed for #to_yaml below
require 'yaml'

# NOTE(review): Transmem is not required anywhere in this script - confirm how it gets loaded
# Keep the block's result: a variable first assigned inside the block would be
# block-local, and a bare `hash` afterwards would call Object#hash instead.
hash = File.open(file) do |fh|
  Transmem.read_toppred(fh)
end

puts hash.to_yaml
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|