mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
@@ -1,123 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
- require 'rubygems'
4
- require 'ms/msrun'
5
- gem 'axml', '= 0.0.2'
6
-
7
# Parses a spectrum title and returns one or two
# [cycle_num, experiment_num, time_in_seconds] triplets.
#
# The title is split on ', '.  Everything from the piece that starts with
# "Cycle(s):" through the end of the string is read as groups such as
#   Cycle(s): 663, 675 (Experiment 2), 667 (Experiment 4)
# and the piece that starts with "Elution:" supplies the time(s) in minutes
# ("Elution: 12.5 to 13.25 min" or "Elution: 12.5 min").
#
# Returns a single triplet when the lowest and highest [cycle, experiment]
# pairs coincide; otherwise the lowest pair with the first elution time
# followed by the highest pair with the last elution time.
#
# Raises ArgumentError when the title lacks a usable "Cycle(s):" or
# "Elution:" section.
def get_cycle_exp_time_triplets(string)
  pieces = string.split(', ')

  cycle_index = pieces.index { |piece| piece =~ /^Cycle\(s\):/ }
  raise ArgumentError, "no 'Cycle(s):' section in #{string.inspect}" unless cycle_index
  elution = pieces.find { |piece| piece =~ /^Elution:/ }
  raise ArgumentError, "no 'Elution:' section in #{string.inspect}" unless elution

  # e.g. "663, 675 (Experiment 2), 667 (Experiment 4)"
  _header, info = pieces[cycle_index..-1].join(', ').split(': ')
  raise ArgumentError, "empty 'Cycle(s):' section in #{string.inspect}" unless info

  cycle_exp_pairs = []
  info.split('), ').each do |group|
    nums, exp_label = group.split('(')
    raise ArgumentError, "no experiment given for cycles #{group.inspect}" unless exp_label
    exp_num = exp_label.split(' ').last.sub(/\)$/, '').to_i
    nums.split(', ').each { |num| cycle_exp_pairs << [num.to_i, exp_num] }
  end
  min, max = cycle_exp_pairs.minmax

  # Minutes -> seconds.  The whole decimal value is converted at once so the
  # result is right for any number of decimal places ("12.5" is 750 s).
  times = elution.split(': ').last.sub(/ min$/, '').split(' to ').map do |minutes|
    minutes.to_f * 60
  end

  if max == min
    [[min.first, min.last, times.first]]
  else
    [[min.first, min.last, times.first], [max.first, max.last, times.last]]
  end
end
51
-
52
# Walks time_to_scan_num (a list of [scan_time, scan_num] pairs) from the
# front and returns the scan_num of the last pair whose time is strictly
# below cycle_time, stopping at the first pair that is not below it.
# Returns nil when the very first pair is not below cycle_time or the list
# is empty.  The `cycle` argument is accepted but not consulted.
def get_scan_num(cycle, cycle_time, time_to_scan_num)
  earlier = time_to_scan_num.take_while { |scan_time, _scan_num| scan_time < cycle_time }
  closest = earlier.last
  closest ? closest[1] : nil
end
64
-
65
- #####################################################
66
- # MAIN:
67
- #####################################################
68
-
69
# Suffix inserted in front of the pepXML file's extension to name the output.
additional_ext = ".with_scan_nums"

if ARGV.size != 2
  puts "usage: #{File.basename(__FILE__)} <file>.pepXML <file>.mzXML"
  puts ""
  puts "uses information from the mzXML file to fix the pepXML file"
  puts "(adds in msms_run_summary: 'base_name' and 'raw_data' attributes;"
  puts " adds scan numbers based on cycle and experiment times)"
  puts ""
  puts "outputs: <file>#{additional_ext}.pepXML"
  exit
end

(pepxml, mzxml) = ARGV
mzxml_basename = File.basename(mzxml).sub(/\.mzxml$/i, '')

# Insert the suffix in front of the trailing extension only; a plain
# substitution would hit the first occurrence anywhere in the path
# (e.g. inside a directory name).
ext = File.extname(pepxml)
output = pepxml[0, pepxml.size - ext.size] + additional_ext + ext

# [time, scan_num] pairs for the msLevel=1 scans of the mzXML file
ms = MS::MSRun.new(mzxml, :lazy => :no_spectra)
time_to_scan_num = ms.scans.select {|scan| scan.ms_level == 1 }.map do |scan|
  [scan.time, scan.num]
end

root = AXML.parse_file(pepxml)
# fix the basename stuff:
msms_r_summary_n = root.child
atts = msms_r_summary_n.attrs
atts['base_name'] = mzxml_basename
atts['raw_data'] = '.mzXML'

# update spectrum queries based on scan number
root.child.find("child::spectrum_query").each do |sq|
  # each quad is [ms1_scan_num, cycle, experiment, time]
  quads = get_cycle_exp_time_triplets(sq['spectrum']).map do |triplet|
    [get_scan_num(triplet[0], triplet[2], time_to_scan_num), *triplet]
  end
  scan_nums = quads.map do |ms1_scan, _cycle, experiment, time|
    # get_scan_num yields nil when no MS1 scan precedes the elution time
    unless ms1_scan
      abort "no MS1 scan earlier than #{time} s for spectrum '#{sq['spectrum']}'"
    end
    ms1_scan + experiment - 1
  end
  sq.attrs['start_scan'] = scan_nums.first.to_s
  sq.attrs['end_scan'] = scan_nums.last.to_s
end

xml_header = '<?xml version="1.0" encoding="UTF-8"?>'
File.open(output, 'w') {|out| out.puts(xml_header); out.print root.to_s }
123
-
data/script/msvis.rb DELETED
@@ -1,42 +0,0 @@
1
- #!/usr/bin/ruby
2
-
3
-
4
# Writes a local.cfg listing the given .msmat files.  An existing local.cfg
# is first renamed to local.cfg.backup.
options_file = "local.cfg"
filetype = "msmat"
files = ARGV.to_a
base = "Msvis_filename"

# Check usage before touching the file system: running without arguments
# must not move an existing local.cfg out of the way.
if files.size == 0
  puts "msvis.rb file.msmat ..."
  puts "right now only creates a local.cfg file"
  exit
end

# These two are only consumed by the commented-out restore code at the end
# of the script.
moving_options_file = false
mv_options_file = ""
if File.exist?(options_file)
  mv_options_file = options_file + ".backup"
  File.rename(options_file, mv_options_file)
  moving_options_file = true
end

File.open(options_file, "w") do |fh|
  fh.print "Msvis_filetype = " + filetype + "\n"
  fh.print "Msvis_num = " + files.size.to_s + "\n"
  files.each_with_index do |file, cnt|
    fh.print( base + cnt.to_s + " = " + "\"#{file}\"" + "\n" )
  end
end
34
-
35
- #exec "./msvis"
36
-
37
- #File.unlink options_file
38
-
39
- #if moving_options_file
40
- # File.rename mv_options_file, options_file
41
- #end
42
-
@@ -1,25 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'spec/mzxml/parser'
4
- require 'spec/msrun'
5
- require 'rexml/document'
6
- include REXML
7
-
8
# With no arguments, print the usage text (the loop below then has nothing
# to do).
if ARGV.empty?
  puts "usage: #{File.basename(__FILE__)} file.mzXML ...",
       "    outputs 'file.mzXML.timeIndex'",
       "    which contains rows of:",
       "    level scan_num time (if !msLevel1:) prec_mz prec_intensity"
end

# For every mzXML file given, build an MS::MSRunIndex and write it next to
# the input as <file>.timeIndex.  Rows of the index:
# level scan_num time [precursor_mz precursor_intensity(if !msLevel1)]
ARGV.each do |mzxml_path|
  index_path = mzxml_path + '.timeIndex'
  puts "READING: " + mzxml_path
  run_index = MS::MSRunIndex.new(mzxml_path)
  puts "WRITING: " + index_path
  run_index.to_index_file(index_path)
end
25
-
@@ -1,67 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'generator'
4
- require 'optparse'
5
-
6
- require 'fasta'
7
- require 'sample_enzyme'
8
- require 'spec_id/digestor'
9
- require 'spec_id/mass'
10
- require 'vec'
11
-
12
# Defaults; each can be overridden on the command line.
opt = {}
opt[:missed_cleavages] = 0   # number of missed cleavages allowed in the digest
opt[:bin_size] = 0.001   # ~ parts per million
opt[:min] = 300.0
opt[:max] = 4500.0
opt[:h_plus] = 1.0

opts = OptionParser.new do |op|
  op.banner = "usage: #{File.basename(__FILE__)} *.fasta"
  op.separator "Outputs a close estimate of number of peptides per bin."
  op.separator "Uses m+H+ as the peptide mass."
  op.separator "[for speed, assumes that there is a peptide mass close to the extremes]"
  op.on("-b", "--bin_size <F>", Float, "size of bins [#{opt[:bin_size]}]") {|v| opt[:bin_size] = v }
  op.on("-x", "--max <F>", Float, "max mass to accept [#{opt[:max]}]") {|v| opt[:max] = v }
  op.on("-n", "--min <F>", Float, "min mass to accept [#{opt[:min]}]") {|v| opt[:min] = v }
  op.on("-h", "--h_plus <F>", Float, "value of H+ to use [#{opt[:h_plus]}]") {|v| opt[:h_plus] = v }
  op.on("-m", "--missed_cleavages <N>", Integer, "num missed cleavages [#{opt[:missed_cleavages]}]") {|v| opt[:missed_cleavages] = v }
end

opts.parse!

if ARGV.size == 0
  puts opts.to_s
  exit
end

min_mass = opt[:min]
max_mass = opt[:max]

ARGV.each do |file|
  fasta = Fasta.new(file)
  uniq_aaseqs = fasta.map do |prot|
    SampleEnzyme.tryptic(prot.aaseq, opt[:missed_cleavages])
  end.flatten.uniq

  # compute the masses once and keep only those inside [min_mass, max_mass]
  masses = Mass::Calculator.new(Mass::MONO, opt[:h_plus]).masses(uniq_aaseqs)
  passing_masses = masses.select do |mh|
    (mh >= min_mass) && (mh <= max_mass)
  end

  # nothing to bin: report it instead of failing on nil below
  if passing_masses.empty?
    warn "#{file}: no peptide masses between #{min_mass} and #{max_mass}"
    next
  end

  ## warn if the masses aren't close to the end points
  highest = passing_masses.max
  lowest = passing_masses.min
  if (max_mass - highest) > 1.0
    warn "highest mass is not that close to max: #{highest}"
  end
  if (lowest - min_mass) > 1.0
    warn "lowest mass is not that close to min: #{lowest}"
  end

  num_bins = (max_mass - min_mass) / opt[:bin_size]

  (_bins, freqs) = VecD.new(passing_masses).histogram(num_bins)

  # report
  puts "#{file}: #{freqs.avg}"

end
data/script/prep_dir.rb DELETED
@@ -1,121 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
-
4
## Helpers for the prep script, added to Kernel so the script body can call
## them as bare functions.
## Assumes the raw and sequest archives share the same base name.
module Kernel

  # patterns of unzipped file names that get deleted by remove_extra_raw
  @@remove_raw = [/flush/, /equil/, /To_sequest/, /to_sequest/, /TempSequence/]
  # extension of a sequest archive
  @@seqext = '.sequest.zip'
  # accepted extensions of a raw archive
  @@rawext = ['.RAW.zip', '.raw.zip']

  ## gets the basename of a file like this filename.RAW.zip or filename.raw.zip
  ## Prints a message and exits with status 1 when neither extension matches.
  def get_basename(zip_file)
    # stripping only shortens the name for the extension that really matches
    candidates = @@rawext.map { |ext| File.basename(zip_file, ext) }
    shortest = candidates.min_by { |name| name.size }
    if candidates.all? { |name| name.size == shortest.size }
      puts "something wrong at the basename"
      exit(1)
    end
    shortest
  end

  ## Deletes every entry of the current directory whose name matches one of
  ## the @@remove_raw patterns.
  def remove_extra_raw
    Dir.foreach(Dir.getwd) do |entry|
      # test all patterns together so a name matching several of them is
      # unlinked exactly once
      next unless @@remove_raw.any? { |pattern| entry =~ pattern }
      puts "removing " + entry
      File.unlink entry
    end
  end

  ## Runs the external raw2mzXML.pl converter on every *.RAW file in the
  ## current directory.
  def raw2mzXML
    system "raw2mzXML.pl *.RAW"
  end

  ## Extracts <basename>/sequest.params from the given sequest zip and returns
  ## the relative path of the extracted file.  Prints a message and exits
  ## when the zip does not exist.
  def get_sequest_params(seqfile)
    unless File.exist?(seqfile)
      puts "couldn't find #{seqfile}"
      exit
    end
    basename = get_seq_basename(seqfile)
    extracted = basename + "/sequest.params"
    # argument-list form: file names are never interpreted by a shell
    system "unzip", seqfile, extracted
    return extracted
  end

  ## gets the basename of a file like this filename.sequest.zip
  def get_seq_basename(file)
    File.basename(file, @@seqext)
  end

end
58
-
59
if ARGV.size < 1
  puts "usage: #{File.basename(__FILE__)} file.raw.zip"
  puts "This is specific to Peng's data to prepare it for OPD"
  exit
end

# Sort the arguments into raw archives and sequest archives.  Both spellings
# understood by get_basename (.RAW.zip and .raw.zip) are accepted.
rawfiles = []
seqfiles = []
ARGV.each do |try|
  if try =~ /\.(RAW|raw)\.zip/
    rawfiles.push(try)
  elsif try =~ /\.sequest\.zip/
    seqfiles.push(try)
  else
    puts "skipping " + try
  end
end

## raw and sequest archives are paired by position, so this depends on them
## being given in the same (alphabetical) order
rawfiles.each_with_index do |rawfile, cnt|
  seqfile = seqfiles[cnt]
  raw_basename = get_basename(rawfile)
  # argument-list form of system: file names never pass through a shell
  system("unzip", rawfile)
  puts "Basename: " + raw_basename
  current_dir = Dir.getwd
  begin
    # Dir.chdir raises (it does not return false) when the directory is missing
    Dir.chdir(raw_basename)
  rescue SystemCallError
    puts "can't change to #{raw_basename}"
    exit
  end
  remove_extra_raw
  raw2mzXML
  system("mkdir raw")
  system("mkdir mzxml")
  # these two need the shell for the glob
  system('mv *.RAW raw/')
  system('mv *.mzXML mzxml/')
  Dir.chdir(current_dir)
  rawzip = raw_basename + '.raw.zip'
  mzxmlzip = raw_basename + '.mzxml.zip'
  system("zip", "-r", rawzip, *Dir.glob(File.join(raw_basename, "raw", "*")))
  system("zip", "-r", mzxmlzip, *Dir.glob(File.join(raw_basename, "mzxml", "*")))
  system("mv", rawzip, raw_basename)
  system("mv", mzxmlzip, raw_basename)
  Dir.chdir(raw_basename)
  # only drop the unpacked directories once both archives are in place
  if (Dir.glob("*.zip").size == 2)
    system("rm -rf raw")
    system("rm -rf mzxml")
  end
  Dir.chdir current_dir

  unless seqfile
    puts "no sequest zip given for #{rawfile}"
    next
  end

  ## get the sequest.params file:
  extracted = get_sequest_params(seqfile)
  system("mv", extracted, raw_basename)

  ## move the sequest file in
  system("chmod", "664", seqfile)
  system("mv", seqfile, raw_basename)

end
120
-
121
-
@@ -1,27 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'fasta'
4
- require 'sample_enzyme'
5
-
6
if ARGV.size < 2
  puts "usage: #{File.basename(__FILE__)} missed_cleavages <file>.fasta ..."
  puts "    returns <file>.missed_cleavages_<missed_cleavages>.peptides"
  abort
end

missed_cleavages = ARGV.shift.to_i

# For each fasta file write one line per protein: the first word of the
# header (without '>'), a tab, and the space-separated tryptic peptides.
ARGV.each do |file|

  # The extension must be at the very end of the name so that this check
  # agrees with the substitution that builds the output name.
  unless file =~ /\.fasta\z/
    abort "must be a fasta file with extension fasta"
  end
  new_filename = file.sub(/\.fasta\z/, '')
  new_filename << ".missed_cleavages_#{missed_cleavages}.peptides"
  File.open(new_filename, "w") do |fh|
    Fasta.new.read_file(file).prots.each do |prot|
      fh.puts( prot.header.split(/\s+/).first.sub(/^>/,'') + "\t" + SampleEnzyme.tryptic(prot.aaseq, missed_cleavages).join(" ") )
    end
  end
end
@@ -1,103 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- require 'spec_id'
4
- require 'fasta'
5
- require 'optparse'
6
-
7
# State of the --top switch; read by the rest of the script.
$top = false
opts = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} bioworks.xml <file>.fasta|prefix"
  parser.separator "outputs stdout (tab del sorted by probability) probability, file:aaseq:charge T/F"
  parser.separator "hashes on file+aaseq+charge"
  parser.on("-t", "--top", "only top peptide (by prob) per scan+charge") { $top = true }
end

opts.parse!

# two positional arguments are required
unless ARGV.size >= 2
  puts opts.to_s
  exit
end

specid_file, file_or_prefix = ARGV.shift(2)

specid = SpecID.new(specid_file)

# An existing file is read as a Fasta object; anything else is kept as the
# plain string that was given.
indicator = File.exist?(file_or_prefix) ? Fasta.new.read_file(file_or_prefix) : file_or_prefix
35
-
36
-
37
# Returns every element of ar whose probability (converted with to_f) equals
# the smallest probability found in ar; ties are all returned, in their
# original order.
def lowest_peps(ar)
  lowest = ar.min_by { |pep| pep.probability.to_f }
  threshold = lowest.probability.to_f
  ar.select { |pep| pep.probability.to_f == threshold }
end
42
-
43
peps = specid.peps

# With --top only the lowest-probability peptide(s) of each scan are kept;
# without it every peptide is a candidate.
candidates =
  if $top
    top_by_scan = []
    peps.hash_by(:base_name, :first_scan).each do |k,v|
      top_by_scan.push( *lowest_peps(v) )
    end
    top_by_scan
  else
    peps
  end

# One result row per base_name/aaseq/charge: [probability, key, is_true]
results = candidates.hash_by(:base_name, :aaseq, :charge).map do |k,v|
  low_peps = lowest_peps(v)
  # protein references of all peptides tied at the lowest probability
  all_prot_references = []
  low_peps.each do |pep|
    all_prot_references.push( *(pep.prots.map {|prot| prot.reference }) )
  end
  all_prot_references.uniq!
  is_true =
    if indicator.is_a? Fasta
      all_prot_references.any? do |ref|
        indicator.included_in_header?(ref)
      end
    else
      !(all_prot_references.all? {|ref| ref.include?( indicator )})
    end
  # every peptide in low_peps shares the same probability, so any will do
  [low_peps.first.probability.to_f, k, is_true]
end

results.sort.each do |result|
  report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
  puts report.join("\t")
end
75
-
76
- =begin
77
- # ORIGINAL CODE
78
- peps = specid.peps
79
- if $top
80
- peps = peps.hash_by(:base_name, :first_scan).map do |k,v|
81
- v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
82
- end
83
- end
84
-
85
- results = peps.hash_by(:base_name, :aaseq, :charge).map do |k,v|
86
- min_pep = v.min {|a,b| a.probability.to_f <=> b.probability.to_f }
87
- references = min_pep.prots.map {|v| v.reference }.uniq
88
- is_true =
89
- if indicator.is_a? Fasta
90
- references.any? do |ref|
91
- indicator.included_in_header?(ref)
92
- end
93
- else
94
- !(references.all? {|ref| ref.include?( indicator )})
95
- end
96
- [min_pep.probability.to_f, k, is_true]
97
- end
98
-
99
- results.sort.each do |result|
100
- report = [result[0], result[1].join(':'), (result[2] ? 'T' : 'F')]
101
- puts report.join("\t")
102
- end
103
- =end
@@ -1,24 +0,0 @@
1
- #!/usr/bin/ruby -s
2
-
3
- require 'optparse'
4
-
5
# Name of the meta file to write; -o/--outfile replaces it.
$outfile = 'meta.sqm'
opts = OptionParser.new do |parser|
  parser.banner = "usage: #{File.basename(__FILE__)} <file>.sqt ..."
  parser.separator "outputs meta.sqm (a sqt meta file)"
  parser.on("-o", "--outfile <file>", "currently: #{$outfile}") { |name| $outfile = name }
end

opts.parse!

if ARGV.empty?
  puts opts.to_s
  exit
end

# one absolute path per line, in the order the files were given
File.open($outfile, 'w') do |out|
  ARGV.each { |sqt_file| out.puts File.expand_path(sqt_file) }
end
24
-
@@ -1,67 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
- ###################################################################
4
- cats = %w(base_name sequence xcorr deltacn first_scan last_scan)
5
- ###################################################################
6
-
7
- require 'spec_id'
8
- require 'hash_by'
9
-
10
# Suffixes that replace ".xml" in the names of the two output files.
extension_top = '.top_per_scan.txt'
extension_all = '.all_peps_per_scan.txt'

if ARGV.size < 1
  puts "usage: #{File.basename(__FILE__)} <file>.xml"
  # list both outputs, using the two extension variables defined above
  puts "output: <file>#{extension_top}"
  puts "        <file>#{extension_all}"
  puts ""
  puts "Generates top hit (highest xcorr) per scan."
  exit
end
20
-
21
# Writes a tab-delimited table to outfile: the headers on the first line,
# then one line per row of table_a_of_a.  Rows are separated by newlines and
# no newline follows the last row.
def print_doc(outfile, headers, table_a_of_a)
  header_line = headers.join("\t") + "\n"
  body = table_a_of_a.map { |cells| cells.join("\t") }.join("\n")
  File.open(outfile, 'w') do |out|
    out.print header_line
    out.print body
  end
end
30
-
31
-
32
# Builds one row per peptide: the reference of the peptide's protein
# followed by the result of sending each symbol in send_to to the peptide.
def pep_array_to_table(peps, send_to)
  peps.map do |pep|
    values = send_to.map { |attribute| pep.send(attribute) }
    [pep.prot.reference, *values]
  end
end
38
-
39
- ###############################################
40
- # MAIN:
41
- ###############################################
42
-
43
file = ARGV[0]
# Strip a trailing ".xml" when there is one and always append the output
# suffix, so an input with another extension is never overwritten.
base = file.sub(/\.xml\z/, '')
outfile_top = base + extension_top
outfile_all = base + extension_all

sp = SpecID.new(file)

# The old (incorrect version)
# pep_hash = sp.peps.hash_by(:first_scan, :last_scan)
# The correct version:
pep_hash = sp.peps.hash_by(:base_name, :first_scan, :last_scan)
# highest xcorr within each group, then ordered by first scan
top_per_scan = pep_hash.map {|k,v| v.sort_by {|ob| ob.xcorr.to_f }.last }
top_per_scan = top_per_scan.sort_by {|pep| pep.first_scan.to_i }

# every peptide, by first scan and then by descending xcorr
all_peps = sp.peps.sort_by do |pep| [pep.first_scan.to_i, -1.0 * pep.xcorr.to_f] end

cats_sym = cats.map {|v| v.to_sym }

a_of_a_top = pep_array_to_table(top_per_scan, cats_sym)
a_of_a_all = pep_array_to_table(all_peps, cats_sym)

# pep_array_to_table puts the protein reference in front of every row
headers = ["protein_reference"] + cats

print_doc(outfile_top, headers, a_of_a_top)
print_doc(outfile_all, headers, a_of_a_all)
67
-
@@ -1,47 +0,0 @@
1
- #!/usr/bin/ruby -w
2
-
3
-
4
- require 'optparse'
5
-
6
require 'yaml'  # provides to_yaml for the final output

opt = {}
opt[:probability] = 1.0
opts = OptionParser.new do |op|
  op.banner = "USAGE: #{File.basename(__FILE__)} toppred.out"
  op.separator "Outputs toppred.yaml"
  op.separator "takes the highest probability structure"
  op.separator "for best structures of equal probability, takes first given"
  op.separator "Each line contains:"
  op.separator "<identifier>: String :"
  op.separator "    num_found: Int"
  op.separator "    num_certain_transmembrane_segments: Int"
  op.separator "    num_putative_transmembrane_segments: Int"
  op.separator "    best_structure_probability: Float"
  op.separator "    transmembrane_segments:"
  op.separator "    - probability: Float"
  op.separator "      start: Int"
  op.separator "      stop: Int"
  op.separator "      aaseq: String"
  op.separator ""
  op.separator "OPTIONS:"
  # the <F> placeholder is what makes the switch take a value
  op.on("-p", "--probability <F>", Float, "min structure prob threshold (default #{opt[:probability]})") {|v| opt[:probability] = v}
end

opts.parse!


if ARGV.size == 0
  puts opts
  exit
end

file = ARGV.shift

# NOTE(review): nothing in this script requires the file that defines
# Transmem; presumably it is expected to be loaded already -- confirm.
# The parsed result is taken from the block's value: a variable first
# assigned inside the block would not be visible after it.
hash = File.open(file) do |fh|
  Transmem.read_toppred(fh)
end

puts hash.to_yaml
44
-
45
-
46
-
47
-