mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
@@ -1,123 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- require 'rubygems'
4
- require 'ms/msrun'
5
- gem 'axml', '= 0.0.2'
6
-
7
- # returns an array containing one or two pairs of [cycle_num, time] that
8
- # represent the lowest and highest cycle numbers coupled to lowest and highest
9
- # time (in seconds) and the lowest and highest associated experiment numbers
10
- def get_cycle_exp_time_triplets(string)
11
- hash = {}
12
- cycle_index = nil
13
- ssplit = string.split(', ')
14
- ssplit.each_with_index do |piece,i|
15
- if piece =~ /^Cycle\(s\):/
16
- cycle_index = i
17
- break
18
- end
19
- end
20
- cycle_info = ssplit[cycle_index..-1].join(", ")
21
- #Cycle(s): 663, 675 (Experiment 2), 667 (Experiment 4)
22
- (header, info) = cycle_info.split(': ')
23
- cycles = []
24
- cycle_exp_pairs = []
25
- info.split('), ').each do |a|
26
- (nums, exp_num) = a.split('(')
27
- nums = nums.split(', ').map {|v| v.to_i }
28
- exp_num = exp_num.split(' ').last.sub(/\)$/,'').to_i
29
- nums.each {|v| cycle_exp_pairs << [v, exp_num] }
30
- end
31
-
32
- min = cycle_exp_pairs.min
33
- max = cycle_exp_pairs.max
34
-
35
- elution = ssplit.select {|v| v.match(/^Elution:(.*)/) }.first
36
- times = elution.split(': ').last
37
- times.sub!(/ min$/,'')
38
- times = times.split(' to ')
39
- times.map! do |v|
40
- (minutes, minute_decimals) = v.split('.')
41
- seconds = minutes.to_f * 60
42
- seconds + ( minute_decimals.to_f * 60 / 100 )
43
- end
44
-
45
- if max == min
46
- [[min.first, min.last, times.first]]
47
- else
48
- [[min.first, min.last, times.first], [max.first, max.last, times.last]]
49
- end
50
- end
51
-
52
- def get_scan_num(cycle, cycle_time, time_to_scan_num)
53
- # grossly inefficient, but guaranteed to get right answer!
54
- below_scan = nil
55
- time_to_scan_num.each do |scan_time, scan_num|
56
- if scan_time < cycle_time
57
- below_scan = scan_num
58
- else
59
- break # scan_time > cycle_time
60
- end
61
- end
62
- below_scan
63
- end
64
-
65
- #####################################################
66
- # MAIN:
67
- #####################################################
68
-
69
- additional_ext = ".with_scan_nums"
70
-
71
- if ARGV.size != 2
72
- puts "usage: #{File.basename(__FILE__)} <file>.pepXML <file>.mzXML"
73
- puts ""
74
- puts "uses information from the mzXML file to fix the pepXML file"
75
- puts "(adds in msms_run_summary: 'base_name' and 'raw_data' attributes;"
76
- puts " adds scan numbers based on cycle and experiment times)"
77
- puts ""
78
- puts "outputs: <file>#{additional_ext}.pepXML"
79
- exit
80
- end
81
-
82
- # get time_to_scan_num for msLevel=1 from the mzXML file
83
- (pepxml, mzxml) = ARGV
84
- mzxml_basename = File.basename(mzxml).sub(/\.mzxml$/i, '')
85
-
86
- ext = File.extname(pepxml)
87
- output = pepxml.sub(Regexp.new(Regexp.escape(ext)), additional_ext + ext)
88
-
89
- ms = MS::MSRun.new(mzxml, :lazy => :no_spectra)
90
- time_to_scan_num = ms.scans.select {|scan| scan.ms_level == 1 }.map do |scan|
91
- [scan.time, scan.num]
92
- end
93
-
94
- # update spectrum queries based on scan number
95
-
96
- root = AXML.parse_file(pepxml)
97
- # fix the basename stuff:
98
- msms_r_summary_n = root.child
99
- atts = msms_r_summary_n.attrs
100
- atts['base_name'] = mzxml_basename
101
- atts['raw_data'] = '.mzXML'
102
-
103
- root.child.find("child::spectrum_query").each do |sq|
104
- triplets = get_cycle_exp_time_triplets(sq['spectrum'])
105
- triplets.map! do |triplet|
106
- [get_scan_num(triplet[0], triplet[2], time_to_scan_num), *triplet]
107
- end
108
- # [scan_num, cycle, exp, time]
109
- quad = triplets.first
110
- first_scan_num = (quad[0] + quad[2] - 1)
111
- sq.attrs['start_scan'] = first_scan_num.to_s
112
- sq.attrs['end_scan'] =
113
- if triplets.size > 1
114
- quad = triplets.last
115
- (quad[0] + quad[2] - 1).to_s
116
- else
117
- first_scan_num.to_s
118
- end
119
- end
120
-
121
- xml_header = '<?xml version="1.0" encoding="UTF-8"?>'
122
- File.open(output, 'w') {|out| out.puts(xml_header); out.print root.to_s }
123
-
data/script/msvis.rb DELETED
@@ -1,42 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
-
4
- options_file = "local.cfg"
5
-
6
- moving_options_file = false
7
- mv_options_file = ""
8
- if File.exist?(options_file)
9
- mv_options_file = options_file + ".backup"
10
- File.rename(options_file, mv_options_file)
11
- moving_options_file = true
12
- end
13
-
14
- filetype = "msmat"
15
- files = ARGV.to_a
16
-
17
- base = "Msvis_filename"
18
-
19
- if files.size == 0
20
- puts "msvis.rb file.msmat ..."
21
- puts "right now only creates a local.cfg file"
22
- exit
23
- end
24
-
25
- File.open(options_file, "w") do |fh|
26
- fh.print "Msvis_filetype = " + filetype + "\n"
27
- fh.print "Msvis_num = " + files.size.to_s + "\n"
28
- cnt = 0
29
- files.each do |file|
30
- fh.print( base + cnt.to_s + " = " + "\"#{file}\"" + "\n" )
31
- cnt += 1
32
- end
33
- end
34
-
35
- #exec "./msvis"
36
-
37
- #File.unlink options_file
38
-
39
- #if moving_options_file
40
- # File.rename mv_options_file, options_file
41
- #end
42
-
@@ -1,25 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'spec/mzxml/parser'
4
- require 'spec/msrun'
5
- require 'rexml/document'
6
- include REXML
7
-
8
- if ARGV.size < 1
9
- puts "usage: #{File.basename(__FILE__)} file.mzXML ..."
10
- puts " outputs 'file.mzXML.timeIndex'"
11
- puts " which contains rows of:"
12
- puts " level scan_num time (if !msLevel1:) prec_mz prec_intensity"
13
- end
14
-
15
- # outputs rows of:
16
- # level scan_num time [precursor_mz precursor_intensity(if !msLevel1)]
17
-
18
- ARGV.each do |file|
19
- puts "READING: " + file
20
- outfile = file + '.timeIndex'
21
- obj = MS::MSRunIndex.new(file)
22
- puts "WRITING: " + outfile
23
- obj.to_index_file(outfile)
24
- end
25
-
@@ -1,67 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'generator'
4
- require 'optparse'
5
-
6
- require 'fasta'
7
- require 'sample_enzyme'
8
- require 'spec_id/digestor'
9
- require 'spec_id/mass'
10
- require 'vec'
11
-
12
- opt = {}
13
- opt[:missed_cleavages] = 0 # ~ parts per million
14
- opt[:bin_size] = 0.001 # ~ parts per million
15
- opt[:min] = 300.0
16
- opt[:max] = 4500.0
17
- opt[:h_plus] = 1.0
18
-
19
- opts = OptionParser.new do |op|
20
- op.banner = "usage: #{File.basename(__FILE__)} *.fasta"
21
- op.separator "Outputs a close estimate of number of peptides per bin."
22
- op.separator "Uses m+H+ as the peptide mass."
23
- op.separator "[for speed, assumes that there is a peptide mass close to the extremes]"
24
- op.on("-b", "--bin_size <F>", Float, "size of bins [#{opt[:bin_size]}]") {|v| opt[:bin_size] = v }
25
- op.on("-x", "--max <F>", Float, "max mass to accept [#{opt[:max]}]") {|v| opt[:max] = v }
26
- op.on("-n", "--min <F>", Float, "min mass to accept [#{opt[:min]}]") {|v| opt[:min] = v }
27
- op.on("-h", "--h_plus <F>", Float, "value of H+ to use [#{opt[:h_plus]}]") {|v| opt[:h_plus] = v }
28
- op.on("-m", "--missed_cleavages <N>", Integer, "num missed cleavages [#{opt[:missed_cleavages]}]") {|v| opt[:missed_cleavages] = v }
29
- end
30
-
31
- opts.parse!
32
-
33
- if ARGV.size == 0
34
- puts opts.to_s
35
- exit
36
- end
37
-
38
- min_mass = opt[:min]
39
- max_mass = opt[:max]
40
-
41
- ARGV.each do |file|
42
- fasta = Fasta.new(file)
43
- uniq_aaseqs = fasta.map do |prot|
44
- SampleEnzyme.tryptic(prot.aaseq, opt[:missed_cleavages])
45
- end.flatten.uniq
46
-
47
- masses = Mass::Calculator.new(Mass::MONO, opt[:h_plus]).masses(uniq_aaseqs)
48
- passing_masses = Mass::Calculator.new(Mass::MONO, opt[:h_plus]).masses(uniq_aaseqs).select do |mh|
49
- ((mh >= min_mass) and (mh <= max_mass))
50
- end
51
-
52
- ## warn if the masses aren't close to the end points
53
- if (max_mass - passing_masses.max) > 1.0
54
- warn "highest mass is not that close to max: #{passing_masses.max}"
55
- end
56
- if (passing_masses.min - min_mass) > 1.0
57
- warn "lowest mass is not that close to min: #{passing_masses.min}"
58
- end
59
-
60
- num_bins = (max_mass - min_mass) / opt[:bin_size]
61
-
62
- (bins, freqs) = VecD.new(passing_masses).histogram(num_bins)
63
-
64
- # report
65
- puts "#{file}: #{freqs.avg}"
66
-
67
- end
data/script/prep_dir.rb DELETED
@@ -1,121 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
-
4
- ## Assumes the same base
5
- module Kernel
6
-
7
- @@remove_raw = [/flush/, /equil/, /To_sequest/, /to_sequest/, /TempSequence/]
8
- @@seqext = '.sequest.zip'
9
- @@rawext = ['.RAW.zip', '.raw.zip']
10
-
11
- ## gets the basename of a file like this filename.RAW.zip or filename.raw.zip
12
- def get_basename(zip_file)
13
- basename = ""
14
- try1 = File.basename(zip_file, @@rawext[0])
15
- try2 = File.basename(zip_file, @@rawext[1])
16
-
17
- if try1.size < try2.size
18
- basename = try1
19
- elsif try1.size > try2.size
20
- basename = try2
21
- else #they are equal
22
- puts "something wrong at the basename"
23
- exit(1)
24
- end
25
- basename
26
- end
27
- def remove_extra_raw
28
- Dir.new(Dir.getwd).each do |test|
29
- @@remove_raw.each do |try|
30
- if test =~ try
31
- puts "removing " + test
32
- File.unlink test
33
- end
34
- end
35
- end
36
- end
37
-
38
- def raw2mzXML
39
- system "raw2mzXML.pl *.RAW"
40
- end
41
-
42
- def get_sequest_params(seqfile)
43
- unless File.exist?(seqfile)
44
- puts "couldn't find #{seqfile}"
45
- exit
46
- end
47
- basename = get_seq_basename(seqfile)
48
- extracted = basename + "/sequest.params"
49
- system "unzip #{seqzip} #{extracted}"
50
- return extracted
51
- end
52
-
53
- def get_seq_basename(file)
54
- File.basename(file, @@seqext)
55
- end
56
-
57
- end
58
-
59
- if ARGV.size < 1
60
- puts "usage: #{File.basename(__FILE__)} file.raw.zip"
61
- puts "This is specific to Peng's data to prepare it for OPD"
62
- exit
63
- end
64
-
65
- rawfiles = []
66
- seqfiles = []
67
- ARGV.each do |try|
68
- if try =~ /\.raw\.zip/
69
- rawfiles.push(try)
70
- elsif try =~ /\.sequest\.zip/
71
- seqfiles.push(try)
72
- else
73
- puts "skipping " + try
74
- end
75
- end
76
-
77
-
78
- ## depends on them being alphebetical
79
- (0..(rawfiles.size)).each do |cnt|
80
- rawfile = rawfiles[cnt]
81
- seqfile = seqfiles[cnt]
82
- break unless rawfile
83
- raw_basename = get_basename(rawfile)
84
- system("unzip #{rawfile}")
85
- puts "Basename: " + raw_basename
86
- current_dir = Dir.getwd
87
- unless Dir.chdir(raw_basename)
88
- puts "can't change to #{raw_basename}"
89
- exit
90
- end
91
- remove_extra_raw
92
- raw2mzXML
93
- system("mkdir raw")
94
- system("mkdir mzxml")
95
- system('mv *.RAW raw/')
96
- system('mv *.mzXML mzxml/')
97
- Dir.chdir(current_dir)
98
- rawzip = raw_basename + '.raw.zip'
99
- mzxmlzip = raw_basename + '.mzxml.zip'
100
- system("zip -r #{rawzip} #{raw_basename}/raw/*")
101
- system("zip -r #{mzxmlzip} #{raw_basename}/mzxml/*")
102
- system("mv #{rawzip} #{raw_basename}")
103
- system("mv #{mzxmlzip} #{raw_basename}")
104
- Dir.chdir(raw_basename)
105
- if (Dir.glob("*.zip").size == 2)
106
- system("rm -rf raw")
107
- system("rm -rf mzxml")
108
- end
109
- Dir.chdir current_dir
110
-
111
- ## get the sequest.params file:
112
- extracted = get_sequest_params(seqfile)
113
- system("mv #{extracted} #{raw_basename}")
114
-
115
- ## move the sequest file in
116
- system("chmod 664 #{seqfile}")
117
- system("mv #{seqfile} #{raw_basename}")
118
-
119
- end
120
-
121
-
@@ -1,27 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'fasta'
4
- require 'sample_enzyme'
5
-
6
- if ARGV.size < 2
7
- puts "usage: #{File.basename(__FILE__)} missed_cleavages <file>.fasta ..."
8
- puts " returns <file>.missed_cleavages_<missed_cleavages>.peptides"
9
- abort
10
- end
11
-
12
- missed_cleavages = ARGV.shift.to_i
13
-
14
- ARGV.each do |file|
15
-
16
- if file !~ /\.fasta/
17
- abort "must be a fasta file with extension fasta"
18
- end
19
- new_filename = file.sub(/\.fasta$/, '')
20
- new_filename << ".missed_cleavages_#{missed_cleavages}.peptides"
21
- File.open(new_filename, "w") do |fh|
22
- peptides = []
23
- Fasta.new.read_file(file).prots.each do |prot|
24
- fh.puts( prot.header.split(/\s+/).first.sub(/^>/,'') + "\t" + SampleEnzyme.tryptic(prot.aaseq, missed_cleavages).join(" ") )
25
- end
26
- end
27
- end
@@ -1,103 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'spec_id'
4
- require 'fasta'
5
- require 'optparse'
6
-
7
- $top = false
8
- opts = OptionParser.new do |op|
9
- op.banner = "usage: #{File.basename(__FILE__)} bioworks.xml <file>.fasta|prefix"
10
- op.separator "outputs stdout (tab del sorted by probability) probability, file:aaseq:charge T/F"
11
- op.separator "hashes on file+aaseq+charge"
12
- op.on("-t", "--top", "only top peptide (by prob) per scan+charge") do
13
- $top = true
14
- end
15
- end
16
-
17
- opts.parse!
18
-
19
- if ARGV.size < 2
20
- puts opts.to_s
21
- exit
22
- end
23
-
24
- specid_file = ARGV.shift
25
- file_or_prefix = ARGV.shift
26
-
27
- specid = SpecID.new(specid_file)
28
-
29
- indicator =
30
- if File.exist? file_or_prefix
31
- Fasta.new.read_file(file_or_prefix)
32
- else
33
- file_or_prefix
34
- end
35
-
36
-
37
- # returns an array containing the min prob peptides (in case of a tie)
38
- def lowest_peps(ar)
39
- min_prob = ar.min {|a,b| a.probability.to_f <=> b.probability.to_f }.probability.to_f
40
- ar.select {|v| v.probability.to_f == min_prob }
41
- end
42
-
43
- peps = specid.peps
44
- if $top
45
- top_by_scan = []
46
- peps.hash_by(:base_name, :first_scan).each do |k,v|
47
- low_peps = lowest_peps(v)
48
- top_by_scan.push( *low_peps )
49
- end
50
- end
51
-
52
- results = top_by_scan.hash_by(:base_name, :aaseq, :charge).map do |k,v|
53
- low_peps = lowest_peps(v)
54
- #min_pep = v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
55
- all_prots = []
56
- low_peps.each do |pep|
57
- all_prot_references.push( *(pep.prots.map {|v| v.reference }) )
58
- end
59
- all_prot_references.uniq!
60
- is_true =
61
- if indicator.is_a? Fasta
62
- all_prot_references.any? do |ref|
63
- indicator.included_in_header?(ref)
64
- end
65
- else
66
- !(all_prot_references.all? {|ref| ref.include?( indicator )})
67
- end
68
- [min_pep.probability.to_f, k, is_true]
69
- end
70
-
71
- results.sort.each do |result|
72
- report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
73
- puts report.join("\t")
74
- end
75
-
76
- =begin
77
- # ORIGINAL CODE
78
- peps = specid.peps
79
- if $top
80
- peps = peps.hash_by(:base_name, :first_scan).map do |k,v|
81
- v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
82
- end
83
- end
84
-
85
- results = peps.hash_by(:base_name, :aaseq, :charge).map do |k,v|
86
- min_pep = v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
87
- references = min_pep.prots.map {|v| v.reference }.uniq
88
- is_true =
89
- if indicator.is_a? Fasta
90
- references.any? do |ref|
91
- indicator.included_in_header?(ref)
92
- end
93
- else
94
- !(references.all? {|ref| ref.include?( indicator )})
95
- end
96
- [min_pep.probability.to_f, k, is_true]
97
- end
98
-
99
- results.sort.each do |result|
100
- report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
101
- puts report.join("\t")
102
- end
103
- =end
@@ -1,24 +0,0 @@
1
- #!/usr/bin/ruby -s
2
-
3
- require 'optparse'
4
-
5
- $outfile = 'meta.sqm'
6
- opts = OptionParser.new do |op|
7
- op.banner = "usage: #{File.basename(__FILE__)} <file>.sqt ..."
8
- op.separator "outputs meta.sqm (a sqt meta file)"
9
- op.on("-o", "--outfile <file>", "currently: #{$outfile}") {|v| $outfile = v}
10
- end
11
-
12
- opts.parse!
13
-
14
- if ARGV.size == 0
15
- puts opts.to_s
16
- exit
17
- end
18
-
19
- File.open($outfile, 'w') do |out|
20
- ARGV.each do |file|
21
- out.puts File.expand_path(file)
22
- end
23
- end
24
-
@@ -1,67 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- ###################################################################
4
- cats = %w(base_name sequence xcorr deltacn first_scan last_scan)
5
- ###################################################################
6
-
7
- require 'spec_id'
8
- require 'hash_by'
9
-
10
- extension_top = '.top_per_scan.txt'
11
- extension_all = '.all_peps_per_scan.txt'
12
-
13
- if ARGV.size < 1
14
- puts "usage: #{File.basename(__FILE__)} <file>.xml"
15
- puts "output: <file>#{extension}"
16
- puts ""
17
- puts "Generates top hit (highest xcorr) per scan."
18
- exit
19
- end
20
-
21
- def print_doc(outfile, headers, table_a_of_a)
22
- document = table_a_of_a.map do |line|
23
- line.join("\t")
24
- end.join("\n")
25
- File.open(outfile, 'w') do |out|
26
- out.print headers.join("\t") + "\n"
27
- out.print document
28
- end
29
- end
30
-
31
-
32
- def pep_array_to_table(peps, send_to)
33
- arr_of_arr = peps.map do |pep|
34
- arr = send_to.map {|sym| pep.send(sym) }
35
- arr.unshift( pep.prot.reference ) # hacked on
36
- end
37
- end
38
-
39
- ###############################################
40
- # MAIN:
41
- ###############################################
42
-
43
- file = ARGV[0]
44
- outfile_top = file.sub(/\.xml$/, extension_top)
45
- outfile_all = file.sub(/\.xml$/, extension_all)
46
-
47
- sp = SpecID.new(file)
48
-
49
- # The old (incorrect version)
50
- # pep_hash = sp.peps.hash_by(:first_scan, :last_scan)
51
- # The correct version:
52
- pep_hash = sp.peps.hash_by(:base_name, :first_scan, :last_scan)
53
- top_per_scan = pep_hash.map {|k,v| v.sort_by {|ob| ob.xcorr.to_f }.last }
54
- top_per_scan = top_per_scan.sort_by {|pep| pep.first_scan.to_i }
55
-
56
- all_peps = sp.peps.sort_by do |pep| [pep.first_scan.to_i, -1.0 * pep.xcorr.to_f] end
57
-
58
- cats_sym = cats.map {|v| v.to_sym }
59
-
60
- a_of_a_top = pep_array_to_table(top_per_scan, cats_sym)
61
- a_of_a_all = pep_array_to_table(all_peps, cats_sym)
62
-
63
- cats.unshift "protein_reference"
64
-
65
- print_doc(outfile_top, cats, a_of_a_top)
66
- print_doc(outfile_all, cats, a_of_a_all)
67
-
@@ -1,47 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
-
4
- require 'optparse'
5
-
6
- opt = {}
7
- opt[:probability] = 1.0
8
- opts = OptionParser.new do |op|
9
- op.banner = "USAGE: #{File.basename(__FILE__)} toppred.out"
10
- op.separator "Outputs toppred.yaml"
11
- op.separator "takes the highest probability structure"
12
- op.separator "for best structures of equal probability, takes first given"
13
- op.separator "Each line contains:"
14
- op.separator "<identifier>: String :"
15
- op.separator " num_found: Int"
16
- op.separator " num_certain_transmembrane_segments: Int"
17
- op.separator " num_putative_transmembrane_segments: Int"
18
- op.separator " best_structure_probability: Float"
19
- op.separator " transmembrane_segments:"
20
- op.separator " - probability: Float"
21
- op.separator " start: Int"
22
- op.separator " stop: Int"
23
- op.separator " aaseq: String"
24
- op.separator ""
25
- op.separator "OPTIONS:"
26
- op.on("-p", "--probability", Float, "min structure prob threshold (default #{opt[:probability]})") {|v| opt[:probability] = v}
27
- end
28
-
29
- opts.parse!
30
-
31
-
32
- if ARGV.size == 0
33
- puts opts
34
- exit
35
- end
36
-
37
- file = ARGV.shift
38
-
39
- File.open(file) do |fh|
40
- hash = Transmem.read_toppred(fh)
41
- end
42
-
43
- puts hash.to_yaml
44
-
45
-
46
-
47
-