mspire 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (233) hide show
  1. data/INSTALL +1 -0
  2. data/README +25 -0
  3. data/Rakefile +129 -40
  4. data/bin/{find_aa_freq.rb → aafreqs.rb} +2 -2
  5. data/bin/bioworks_to_pepxml.rb +1 -0
  6. data/bin/fasta_shaker.rb +1 -96
  7. data/bin/filter_and_validate.rb +5 -0
  8. data/bin/{mzxml_to_lmat.rb → ms_to_lmat.rb} +8 -7
  9. data/bin/prob_validate.rb +6 -0
  10. data/bin/raw_to_mzXML.rb +2 -2
  11. data/bin/srf_group.rb +1 -0
  12. data/bin/srf_to_sqt.rb +40 -0
  13. data/changelog.txt +68 -0
  14. data/lib/align/chams.rb +6 -6
  15. data/lib/align.rb +4 -3
  16. data/lib/bsearch.rb +120 -0
  17. data/lib/fasta.rb +318 -86
  18. data/lib/group_by.rb +10 -0
  19. data/lib/index_by.rb +11 -0
  20. data/lib/merge_deep.rb +21 -0
  21. data/lib/{spec → ms/converter}/mzxml.rb +77 -109
  22. data/lib/ms/gradient_program.rb +171 -0
  23. data/lib/ms/msrun.rb +209 -0
  24. data/lib/{spec/msrun.rb → ms/msrun_index.rb} +7 -40
  25. data/lib/ms/parser/mzdata/axml.rb +12 -0
  26. data/lib/ms/parser/mzdata/dom.rb +160 -0
  27. data/lib/ms/parser/mzdata/libxml.rb +7 -0
  28. data/lib/ms/parser/mzdata.rb +25 -0
  29. data/lib/ms/parser/mzxml/axml.rb +11 -0
  30. data/lib/ms/parser/mzxml/dom.rb +159 -0
  31. data/lib/ms/parser/mzxml/hpricot.rb +253 -0
  32. data/lib/ms/parser/mzxml/libxml.rb +15 -0
  33. data/lib/ms/parser/mzxml/regexp.rb +122 -0
  34. data/lib/ms/parser/mzxml/rexml.rb +72 -0
  35. data/lib/ms/parser/mzxml/xmlparser.rb +248 -0
  36. data/lib/ms/parser/mzxml.rb +175 -0
  37. data/lib/ms/parser.rb +108 -0
  38. data/lib/ms/precursor.rb +10 -0
  39. data/lib/ms/scan.rb +81 -0
  40. data/lib/ms/spectrum.rb +193 -0
  41. data/lib/ms.rb +10 -0
  42. data/lib/mspire.rb +4 -0
  43. data/lib/roc.rb +61 -1
  44. data/lib/sample_enzyme.rb +31 -8
  45. data/lib/scan_i.rb +21 -0
  46. data/lib/spec_id/aa_freqs.rb +7 -3
  47. data/lib/spec_id/bioworks.rb +20 -14
  48. data/lib/spec_id/digestor.rb +139 -0
  49. data/lib/spec_id/mass.rb +116 -0
  50. data/lib/spec_id/parser/proph.rb +236 -0
  51. data/lib/spec_id/precision/filter/cmdline.rb +209 -0
  52. data/lib/spec_id/precision/filter/interactive.rb +134 -0
  53. data/lib/spec_id/precision/filter/output.rb +147 -0
  54. data/lib/spec_id/precision/filter.rb +623 -0
  55. data/lib/spec_id/precision/output.rb +60 -0
  56. data/lib/spec_id/precision/prob/cmdline.rb +139 -0
  57. data/lib/spec_id/precision/prob/output.rb +88 -0
  58. data/lib/spec_id/precision/prob.rb +171 -0
  59. data/lib/spec_id/proph/pep_summary.rb +92 -0
  60. data/lib/spec_id/proph/prot_summary.rb +484 -0
  61. data/lib/spec_id/proph.rb +2 -466
  62. data/lib/spec_id/protein_summary.rb +2 -2
  63. data/lib/spec_id/sequest/params.rb +316 -0
  64. data/lib/spec_id/sequest/pepxml.rb +1513 -0
  65. data/lib/spec_id/sequest.rb +2 -1672
  66. data/lib/spec_id/srf.rb +445 -177
  67. data/lib/spec_id.rb +183 -95
  68. data/lib/spec_id_xml.rb +8 -10
  69. data/lib/transmem/phobius.rb +147 -0
  70. data/lib/transmem/toppred.rb +368 -0
  71. data/lib/transmem.rb +157 -0
  72. data/lib/validator/aa.rb +135 -0
  73. data/lib/validator/background.rb +73 -0
  74. data/lib/validator/bias.rb +95 -0
  75. data/lib/validator/cmdline.rb +260 -0
  76. data/lib/validator/decoy.rb +94 -0
  77. data/lib/validator/digestion_based.rb +69 -0
  78. data/lib/validator/probability.rb +48 -0
  79. data/lib/validator/prot_from_pep.rb +234 -0
  80. data/lib/validator/transmem.rb +272 -0
  81. data/lib/validator/true_pos.rb +46 -0
  82. data/lib/validator.rb +214 -0
  83. data/lib/xml.rb +38 -0
  84. data/lib/xml_style_parser.rb +105 -0
  85. data/lib/xmlparser_wrapper.rb +19 -0
  86. data/script/compile_and_plot_smriti_final.rb +97 -0
  87. data/script/extract_gradient_programs.rb +56 -0
  88. data/script/get_apex_values_rexml.rb +44 -0
  89. data/script/mzXML2timeIndex.rb +1 -1
  90. data/script/smriti_final_analysis.rb +103 -0
  91. data/script/toppred_to_yaml.rb +47 -0
  92. data/script/tpp_installer.rb +1 -1
  93. data/{test/tc_align.rb → specs/align_spec.rb} +21 -27
  94. data/{test/tc_bioworks_to_pepxml.rb → specs/bin/bioworks_to_pepxml_spec.rb} +25 -41
  95. data/specs/bin/fasta_shaker_spec.rb +259 -0
  96. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +202 -0
  97. data/specs/bin/filter_and_validate_spec.rb +124 -0
  98. data/specs/bin/ms_to_lmat_spec.rb +34 -0
  99. data/specs/bin/prob_validate_spec.rb +62 -0
  100. data/specs/bin/protein_summary_spec.rb +10 -0
  101. data/{test/tc_fasta.rb → specs/fasta_spec.rb} +354 -310
  102. data/specs/gi_spec.rb +22 -0
  103. data/specs/load_bin_path.rb +7 -0
  104. data/specs/merge_deep_spec.rb +13 -0
  105. data/specs/ms/gradient_program_spec.rb +77 -0
  106. data/specs/ms/msrun_spec.rb +455 -0
  107. data/specs/ms/parser_spec.rb +92 -0
  108. data/specs/ms/spectrum_spec.rb +89 -0
  109. data/specs/roc_spec.rb +251 -0
  110. data/specs/rspec_autotest.rb +149 -0
  111. data/specs/sample_enzyme_spec.rb +41 -0
  112. data/specs/spec_helper.rb +133 -0
  113. data/specs/spec_id/aa_freqs_spec.rb +52 -0
  114. data/{test/tc_bioworks.rb → specs/spec_id/bioworks_spec.rb} +56 -71
  115. data/specs/spec_id/digestor_spec.rb +75 -0
  116. data/specs/spec_id/precision/filter/cmdline_spec.rb +20 -0
  117. data/specs/spec_id/precision/filter/output_spec.rb +31 -0
  118. data/specs/spec_id/precision/filter_spec.rb +243 -0
  119. data/specs/spec_id/precision/prob_spec.rb +111 -0
  120. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  121. data/specs/spec_id/proph/pep_summary_spec.rb +143 -0
  122. data/{test/tc_proph.rb → specs/spec_id/proph/prot_summary_spec.rb} +52 -32
  123. data/{test/tc_protein_summary.rb → specs/spec_id/protein_summary_spec.rb} +85 -0
  124. data/specs/spec_id/sequest/params_spec.rb +68 -0
  125. data/specs/spec_id/sequest/pepxml_spec.rb +452 -0
  126. data/specs/spec_id/sqt_spec.rb +138 -0
  127. data/specs/spec_id/srf_spec.rb +209 -0
  128. data/specs/spec_id/srf_spec_helper.rb +302 -0
  129. data/specs/spec_id_helper.rb +33 -0
  130. data/specs/spec_id_spec.rb +361 -0
  131. data/specs/spec_id_xml_spec.rb +33 -0
  132. data/specs/transmem/phobius_spec.rb +423 -0
  133. data/specs/transmem/toppred_spec.rb +297 -0
  134. data/specs/transmem_spec.rb +60 -0
  135. data/specs/transmem_spec_shared.rb +64 -0
  136. data/specs/validator/aa_spec.rb +107 -0
  137. data/specs/validator/background_spec.rb +51 -0
  138. data/specs/validator/bias_spec.rb +146 -0
  139. data/specs/validator/decoy_spec.rb +51 -0
  140. data/specs/validator/fasta_helper.rb +26 -0
  141. data/specs/validator/prot_from_pep_spec.rb +141 -0
  142. data/specs/validator/transmem_spec.rb +145 -0
  143. data/specs/validator/true_pos_spec.rb +58 -0
  144. data/specs/validator_helper.rb +33 -0
  145. data/specs/xml_spec.rb +12 -0
  146. data/test_files/000_pepxml18_small.xml +206 -0
  147. data/test_files/020a.mzXML.timeIndex +4710 -0
  148. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +3973 -0
  149. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +3872 -0
  150. data/test_files/4-03-03_small-prot.xml +321 -0
  151. data/test_files/4-03-03_small.xml +3876 -0
  152. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  153. data/test_files/bioworks-3.3_10prots.xml +5999 -0
  154. data/test_files/bioworks31.params +77 -0
  155. data/test_files/bioworks32.params +62 -0
  156. data/test_files/bioworks33.params +63 -0
  157. data/test_files/bioworks_single_run_small.xml +7237 -0
  158. data/test_files/bioworks_small.fasta +212 -0
  159. data/test_files/bioworks_small.params +63 -0
  160. data/test_files/bioworks_small.phobius +109 -0
  161. data/test_files/bioworks_small.toppred.out +2847 -0
  162. data/test_files/bioworks_small.xml +5610 -0
  163. data/test_files/bioworks_with_INV_small.xml +3753 -0
  164. data/test_files/bioworks_with_SHUFF_small.xml +2503 -0
  165. data/test_files/corrupted_900.srf +0 -0
  166. data/test_files/head_of_7MIX.srf +0 -0
  167. data/test_files/interact-opd1_mods_small-prot.xml +304 -0
  168. data/test_files/messups.fasta +297 -0
  169. data/test_files/opd1/000.my_answer.100lines.xml +101 -0
  170. data/test_files/opd1/000.tpp_1.2.3.first10.xml +115 -0
  171. data/test_files/opd1/000.tpp_2.9.2.first10.xml +126 -0
  172. data/test_files/opd1/000.v2.1.mzXML.timeIndex +3748 -0
  173. data/test_files/opd1/000_020-prot.png +0 -0
  174. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +62 -0
  175. data/test_files/opd1/000_020_3prots-prot.xml +62 -0
  176. data/test_files/opd1/opd1_cat_inv_small-prot.xml +139 -0
  177. data/test_files/opd1/sequest.3.1.params +77 -0
  178. data/test_files/opd1/sequest.3.2.params +62 -0
  179. data/test_files/opd1/twenty_scans.mzXML +418 -0
  180. data/test_files/opd1/twenty_scans.v2.1.mzXML +382 -0
  181. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  182. data/test_files/opd1/twenty_scans_answ.lmata +9 -0
  183. data/test_files/opd1_020_beginning.RAW +0 -0
  184. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +753 -0
  185. data/test_files/orbitrap_mzData/000_cut.xml +1920 -0
  186. data/test_files/pepproph_small.xml +4691 -0
  187. data/test_files/phobius.small.noheader.txt +50 -0
  188. data/test_files/phobius.small.small.txt +53 -0
  189. data/test_files/s01_anC1_ld020mM.key.txt +25 -0
  190. data/test_files/s01_anC1_ld020mM.meth +0 -0
  191. data/test_files/small.fasta +297 -0
  192. data/test_files/smallraw.RAW +0 -0
  193. data/test_files/tf_bioworks2excel.bioXML +14340 -0
  194. data/test_files/tf_bioworks2excel.txt.actual +1035 -0
  195. data/test_files/toppred.small.out +416 -0
  196. data/test_files/toppred.xml.out +318 -0
  197. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +7 -0
  198. data/test_files/validator_hits_separate/bioworks_small_HS.xml +5651 -0
  199. data/test_files/yeast_gly_small-prot.xml +265 -0
  200. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +6 -0
  201. data/test_files/yeast_gly_small.xml +3807 -0
  202. data/test_files/yeast_gly_small2.parentTimes +6 -0
  203. metadata +273 -57
  204. data/bin/filter.rb +0 -6
  205. data/bin/precision.rb +0 -5
  206. data/lib/spec/mzdata/parser.rb +0 -108
  207. data/lib/spec/mzdata.rb +0 -48
  208. data/lib/spec/mzxml/parser.rb +0 -449
  209. data/lib/spec/scan.rb +0 -55
  210. data/lib/spec_id/filter.rb +0 -797
  211. data/lib/spec_id/precision.rb +0 -421
  212. data/lib/toppred.rb +0 -18
  213. data/script/filter-peps.rb +0 -164
  214. data/test/tc_aa_freqs.rb +0 -59
  215. data/test/tc_fasta_shaker.rb +0 -149
  216. data/test/tc_filter.rb +0 -203
  217. data/test/tc_filter_peps.rb +0 -46
  218. data/test/tc_gi.rb +0 -17
  219. data/test/tc_id_class_anal.rb +0 -70
  220. data/test/tc_id_precision.rb +0 -89
  221. data/test/tc_msrun.rb +0 -88
  222. data/test/tc_mzxml.rb +0 -88
  223. data/test/tc_mzxml_to_lmat.rb +0 -36
  224. data/test/tc_peptide_parent_times.rb +0 -27
  225. data/test/tc_precision.rb +0 -60
  226. data/test/tc_roc.rb +0 -166
  227. data/test/tc_sample_enzyme.rb +0 -32
  228. data/test/tc_scan.rb +0 -26
  229. data/test/tc_sequest.rb +0 -336
  230. data/test/tc_spec.rb +0 -78
  231. data/test/tc_spec_id.rb +0 -201
  232. data/test/tc_spec_id_xml.rb +0 -36
  233. data/test/tc_srf.rb +0 -262
data/lib/spec_id/srf.rb CHANGED
@@ -1,7 +1,12 @@
1
+ require 'spec_id'
1
2
  require 'spec_id/sequest'
3
+ require 'fasta'
4
+ require 'mspire'
5
+ require 'set'
6
+ require 'fasta'
2
7
 
3
8
  module BinaryReader
4
- Null_char = "\0"[0] ## change for ruby 1.9 or 2.0
9
+ Null_char = "\0"[0] ## TODO: change for ruby 1.9 or 2.0
5
10
  # extracts a string with all empty chars at the end stripped
6
11
  # expects the filehandle to be at the proper location
7
12
  def get_null_padded_string(fh,bytes)
@@ -16,62 +21,6 @@ module BinaryReader
16
21
  end
17
22
 
18
23
  # class to extract information from <file>_dta.log files
19
- class DTALog
20
- # returns an array indexed by the dta file number (starting at 0)
21
- # each entry is an array [first_scan, last_scan, dta_filename_noext]
22
- # this is now obsolete since I found the scan # index at the end of the srf
23
- # files
24
- def self.dta_and_scans_by_dta_index(file)
25
- dta_index = nil
26
- final_scan = nil
27
- dta_cnt = 0
28
- re = /^ m/o
29
- scan_line_re = /scan: (\d+) - (\d+), Datafile: (.*?) (.*)/o
30
- other_dta_re = /Datafile: (.*?) /o
31
- File.open(file) do |fh|
32
- 10.times { fh.readline }
33
- scan_range_line = fh.readline
34
- if scan_range_line =~ /scan range\s+= \d+ - (\d+)/
35
- # this is an overestimate (since MS scans have no dta, but that's OK)
36
- dta_index = Array.new($1.to_i)
37
- else
38
- dta_index = []
39
- end
40
- 3.times { fh.readline }
41
- fh.each do |line|
42
- if line =~ re
43
- if line =~ scan_line_re
44
- first_scan = $1.to_i
45
- last_scan = $2.to_i
46
- the_rest = $4.dup
47
- dta_index[dta_cnt] = [first_scan, last_scan, $3.sub(/\.dta/,'')]
48
- dta_cnt += 1
49
- if the_rest =~ other_dta_re
50
- dta_index[dta_cnt] = [first_scan, last_scan, $1.sub(/\.dta/,'')]
51
- dta_cnt += 1
52
- end
53
- end
54
- break
55
- end
56
- end
57
- fh.each do |line|
58
- if line =~ scan_line_re
59
- first_scan = $1.to_i
60
- last_scan = $2.to_i
61
- the_rest = $4.dup
62
- dta_index[dta_cnt] = [first_scan, last_scan, $3.sub(/\.dta/,'')]
63
- dta_cnt += 1
64
- if the_rest =~ other_dta_re
65
- dta_index[dta_cnt] = [first_scan, last_scan, $1.sub(/\.dta/,'')]
66
- dta_cnt += 1
67
- end
68
- end
69
- end
70
- end
71
- dta_index.compact! # remove those trailing nils
72
- dta_index
73
- end
74
- end
75
24
 
76
25
  class SRFGroup
77
26
  include SpecID
@@ -83,12 +32,15 @@ class SRFGroup
83
32
  # takes an array of filenames
84
33
  # or a single .srg filename
85
34
  # see from_srg to load a single .srg file
86
- def initialize(filenames=nil)
35
+ # by default, the hits will be returned filtered by sequest params values.
36
+ # [The raw SRF data is unfiltered!]
37
+ def initialize(filenames=nil, filter_hits_by_params=true)
87
38
  @filenames = filenames
88
39
  @peps = []
89
40
  @prots = []
90
- @global_ref_hash = {}
91
41
  @srfs = []
42
+
43
+ global_ref_hash = {}
92
44
  if filenames
93
45
  if filenames.is_a?(String) && filenames =~ /\.srg$/
94
46
  srg_filename = filenames.dup
@@ -103,16 +55,99 @@ class SRFGroup
103
55
  end
104
56
  end
105
57
  filenames.each do |file|
106
- @srfs << SRF.new(file, @peps, @global_ref_hash)
58
+ @srfs << SRF.new(file, @peps, global_ref_hash)
59
+ end
60
+ @prots = global_ref_hash.values
61
+ if filter_hits_by_params
62
+ filter_by_peptide_mass_tolerance
63
+ end
64
+ end
65
+ end
66
+
67
+ # if srfs were read in separately, then the proteins will need to be merged
68
+ # by their reference
69
+ def merge_different_sets(srfs)
70
+ raise NotImplementedError, "need to implement?"
71
+ end
72
+
73
+ # 1. sets @prots and returns it: a new list of proteins based on which
74
+ # peptides passed.
75
+ # 2. updates the out_file's list of hits based on passing peptides (but not
76
+ # the original hit id; rank is implicit in array ordering)
77
+ # 3. updates each protein to only include peptides passing thresholds.
78
+ # [Note, this process is how .out files are generated!]
79
+ # 4. recalculates deltacn values completely if number of hits changed (does
80
+ # not touch deltacn orig)
81
+ # ASSUMES:
82
+ # A. all srfs have identical params objects and each has a
83
+ # peptide_mass_tolerance filter attribute.
84
+ # B. proteins are already unique (peptides referencing the same protein
85
+ # reference the same object already) In practice, this means all srfs were
86
+ # read in together.
87
+ def filter_by_peptide_mass_tolerance
88
+ prots_in_set = Set.new
89
+ params = @srfs.first.params
90
+ pmt = params.peptide_mass_tolerance.to_f
91
+ methd = nil # the method to
92
+
93
+ case params.peptide_mass_units
94
+ when '0'
95
+ amu_based = true
96
+ milli_amu = false
97
+ when '1'
98
+ amu_based = true
99
+ milli_amu = true
100
+ when '2'
101
+ amu_based = false
102
+ end
103
+
104
+ @srfs.each do |srf|
105
+ srf.out_files.each do |out_file|
106
+ hits = out_file.hits
107
+ before = hits.size
108
+ hits.reject! do |pep|
109
+ do_not_keep =
110
+ if amu_based
111
+ if milli_amu
112
+ (pep.deltamass.abs > (pmt/1000))
113
+ else
114
+ (pep.deltamass.abs > pmt)
115
+ end
116
+ else
117
+ (pep.ppm.abs > pmt)
118
+ end
119
+ unless do_not_keep
120
+ pep.prots.each do |prot|
121
+ if prots_in_set.include?(prot)
122
+ prot.peps << pep
123
+ else
124
+ prots_in_set.add(prot)
125
+ prot.peps = [pep]
126
+ end
127
+ end
128
+ end
129
+ do_not_keep
130
+ end
131
+ if hits.size != before
132
+ SRF::OUT::Pep.set_deltacn_from_xcorr(hits)
133
+ end
107
134
  end
108
135
  end
136
+ @prots = prots_in_set.to_a
137
+
109
138
  end
110
139
 
111
140
  # returns the filename used
141
+ # if the file exists, the name will be expanded to full path, otherwise just
142
+ # what is given
112
143
  def to_srg(srg_filename='bioworks.srg')
113
144
  File.open(srg_filename, 'w') do |v|
114
145
  @filenames.each do |srf_file|
115
- v.puts File.expand_path(srf_file)
146
+ if File.exist? srf_file
147
+ v.puts File.expand_path(srf_file)
148
+ else
149
+ v.puts srf_file
150
+ end
116
151
  end
117
152
  end
118
153
  srg_filename
@@ -132,15 +167,25 @@ class SRF
132
167
  # [first_scan, last_scan, charge]
133
168
  attr_accessor :index
134
169
  attr_accessor :base_name
170
+ # this is the global peptides array
171
+ attr_accessor :peps
172
+ # the global reference hash that allows...
173
+ attr_accessor :global_ref_hash
135
174
 
136
175
  def dta_start_byte
137
176
  case @version
138
177
  when '3.2' ; 3260
139
178
  when '3.3' ; 3644
179
+ when '3.5' ; 3644
140
180
  end
141
181
  end
142
182
 
143
- # peps and
183
+ # peps and global_ref_hash are created as the srf files is read. If the
184
+ # file is read as part of a group, then these should be passed in.
185
+ # NOTE: if you want the hits filtered by precursor tolerance (the way they
186
+ # might be displayed in .out files) you should probably use SRFGroup (which
187
+ # does this by default)
188
+ # SRF is meant to be a low level read of the file.
144
189
  def initialize(filename=nil, peps=[], global_ref_hash={})
145
190
  @dta_files = []
146
191
  @out_files = []
@@ -149,17 +194,203 @@ class SRF
149
194
  end
150
195
  end
151
196
 
197
+ def round(float, decimal_places)
198
+ sprintf("%.#{decimal_places}f", float)
199
+ end
200
+
201
+ # the out_filename will be the base_name + .sqt unless 'out_filename' is
202
+ # defined
203
+ # :round => round floating point numbers
204
+ # etc...
205
+ def to_sqt(out_filename=nil, opts={})
206
+ tic_dp = 2
207
+ mh_dp = 7
208
+ xcorr_dp = 5
209
+ sp_dp = 2
210
+ dcn_dp = 5
211
+
212
+ defaults = {:db_info=>false, :new_db_path=>nil, :update_db_path=>false, :round=>false}
213
+ opt = defaults.merge(opts)
214
+
215
+ outfile =
216
+ if out_filename
217
+ out_filename
218
+ else
219
+ base_name + '.sqt'
220
+ end
221
+ invariant_ordering = %w(SQTGenerator SQTGeneratorVersion Database FragmentMasses PrecursorMasses StartTime) # just for readability and consistency
222
+ fmt =
223
+ if params.fragment_mass_type == 'average' ; 'AVG'
224
+ else ; 'MONO'
225
+ end
226
+ pmt =
227
+ if params.precursor_mass_type == 'average' ; 'AVG'
228
+ else ; 'MONO'
229
+ end
230
+
231
+ mass_table = params.mass_table
232
+ static_mods = params.static_mods.map do |k,v|
233
+ key = k.split(/_/)[1]
234
+ if key.size == 1
235
+ key + '=' + (mass_table[key.to_sym] + v.to_f).to_s
236
+ else
237
+ key + '=' + v
238
+ end
239
+ end
240
+
241
+ dynamic_mods = []
242
+ header.modifications.scan(/\((.*?)\)/) do |match|
243
+ dynamic_mods << match.first.sub(/ /,'=')
244
+ end
245
+ plural = {
246
+ 'StaticMod' => static_mods,
247
+ 'DynamicMod' => dynamic_mods, # example as diff mod
248
+ 'Comment' => ['Created from Bioworks .srf file']
249
+ }
250
+
251
+
252
+ db_filename = header.db_filename
253
+ db_filename_in_sqt = db_filename
254
+ if opt[:new_db_path]
255
+ db_filename = File.join(opt[:new_db_path], File.basename(db_filename.gsub('\\', '/')))
256
+ if opt[:update_db_path]
257
+ db_filename_in_sqt = File.expand_path(db_filename)
258
+ warn "writing Database #{db_filename} to sqt, but it does not exist on this file system" unless File.exist?(db_filename)
259
+ end
260
+ end
261
+
262
+ apmu =
263
+ case params.peptide_mass_units
264
+ when '0' : 'amu'
265
+ when '1' : 'mmu'
266
+ when '2' : 'ppm'
267
+ end
268
+
269
+ hh = {
270
+ 'SQTGenerator' => 'mspire',
271
+ 'SQTGeneratorVersion' => Mspire::Version,
272
+ 'Database' => db_filename_in_sqt,
273
+ 'FragmentMasses' => fmt,
274
+ 'PrecursorMasses' => pmt,
275
+ 'StartTime' => '', # Bioworks 3.2 also leaves this blank...
276
+ 'Alg-PreMassTol' => params.peptide_mass_tolerance,
277
+ 'Alg-FragMassTol' => params.fragment_ion_tolerance,
278
+ 'Alg-PreMassUnits' => apmu, ## mine
279
+ 'Alg-IonSeries' => header.ion_series.split(':').last.lstrip,
280
+ 'Alg-Enzyme' => header.enzyme.split(':').last,
281
+ 'Alg-MSModel' => header.model,
282
+ }
283
+
284
+ if opt[:db_info]
285
+ if File.exist?(db_filename)
286
+ reply = get_db_info_for_sqt(db_filename)
287
+ %w(DBSeqLength DBLocusCount DBMD5Sum).zip(reply) do |label,val|
288
+ hh[label] = val
289
+ end
290
+ else
291
+ warn "file #{db_filename} does not exist, no extra db info in header!"
292
+ end
293
+ end
294
+
295
+ has_hits = (self.out_files.size > 0)
296
+ if has_hits
297
+ # somewhat redundant with above, but we can get this without a db present!
298
+ hh['DBLocusCount'] = self.out_files.first.db_locus_count
299
+ end
300
+
301
+ File.open(outfile, 'w') do |out|
302
+ # print the header:
303
+ invariant_ordering.each do |iv|
304
+ out.puts ['H', iv, hh.delete(iv)].join("\t")
305
+ end
306
+ hh.each do |k,v|
307
+ out.puts ['H', k, v].join("\t")
308
+ end
309
+ plural.each do |k,vals|
310
+ vals.each do |val|
311
+ out.puts ['H', k, val].join("\t")
312
+ end
313
+ end
314
+
315
+ ##### SPECTRA
316
+ time_to_process = '0.0'
317
+ #########################################
318
+ # NEED TO FIGURE OUT: (in spectra guy)
319
+ # * Lowest Sp value for top 500 spectra
320
+ # * Number of sequences matching this precursor ion
321
+ #########################################
322
+
323
+
324
+ manual_validation_status = 'U'
325
+ self.out_files.zip(dta_files) do |out_file, dta_file|
326
+ # don't have the time to process (using 0.0 like bioworks 3.2)
327
+ dta_file_mh = dta_file.mh
328
+ out_file_total_inten = out_file.total_inten
329
+ out_file_lowest_sp = out_file.lowest_sp
330
+ if opt[:round]
331
+ dta_file_mh = round(dta_file_mh, mh_dp)
332
+ out_file_total_inten = round(out_file_total_inten, tic_dp)
333
+ out_file_lowest_sp = round(out_file_lowest_sp, sp_dp)
334
+ end
335
+
336
+ out.puts ['S', out_file.first_scan, out_file.last_scan, out_file.charge, time_to_process, out_file.computer, dta_file_mh, out_file_total_inten, out_file_lowest_sp, out_file.num_matched_peptides].join("\t")
337
+ out_file.hits.each_with_index do |hit,index|
338
+ hit_mh = hit.mh
339
+ hit_deltacn_orig = hit.deltacn_orig
340
+ hit_xcorr = hit.xcorr
341
+ hit_sp = hit.sp
342
+ if opt[:round]
343
+ hit_mh = round(hit_mh, mh_dp)
344
+ hit_deltacn_orig = round(hit_deltacn_orig, dcn_dp)
345
+ hit_xcorr = round(hit_xcorr, xcorr_dp)
346
+ hit_sp = round(hit_sp, sp_dp)
347
+ end
348
+ # note that the rank is determined by the order..
349
+ out.puts ['M', index+1, hit.rsp, hit_mh, hit_deltacn_orig, hit_xcorr, hit_sp, hit.ions_matched, hit.ions_total, hit.sequence, manual_validation_status].join("\t")
350
+ hit.prots.each do |prot|
351
+ out.puts ['L', prot.first_entry].join("\t")
352
+ end
353
+ end
354
+ end
355
+ end # close the filehandle
356
+
357
+ end
358
+
359
+ # assumes the file exists and is readable
360
+ # returns [DBSeqLength, DBLocusCount, DBMD5Sum] or nil if no file
361
+ def get_db_info_for_sqt(dbfile)
362
+ fasta = Fasta.new(dbfile)
363
+ [fasta.aa_seq_length, fasta.size, fasta.md5_sum]
364
+ end
365
+
366
+
152
367
  # returns self
153
368
  def from_file(filename, peps, global_ref_hash)
154
369
 
155
370
  File.open(filename, "rb") do |fh|
156
371
  @header = SRF::Header.new.from_handle(fh)
157
372
  @version = @header.version
158
- @dta_files, measured_mhs = read_dta_files(fh,@header.num_dta_files)
159
- @out_files = read_out_files(fh,@header.num_dta_files, global_ref_hash, measured_mhs)
160
- @params = Sequest::Params.new.parse_handle(fh)
161
- fh.read(12) ## gap between last params entry and index
162
- @index = read_scan_index(fh,@header.num_dta_files)
373
+ unpack_35 = case @version
374
+ when '3.2'
375
+ false
376
+ when '3.3'
377
+ false
378
+ when '3.5'
379
+ true
380
+ end
381
+ @dta_files, measured_mhs = read_dta_files(fh,@header.num_dta_files, unpack_35)
382
+
383
+ @out_files = read_out_files(fh,@header.num_dta_files, global_ref_hash, measured_mhs, unpack_35)
384
+ if fh.eof?
385
+ warn "FILE: '#{filename}' appears to be an abortive run (no params in srf file)\nstill continuing..."
386
+ @params = nil
387
+ @index = []
388
+ else
389
+ @params = Sequest::Params.new.parse_handle(fh)
390
+ # This is very sensitive to the grab_params method in sequest params
391
+ fh.read(12) ## gap between last params entry and index
392
+ @index = read_scan_index(fh,@header.num_dta_files)
393
+ end
163
394
  end
164
395
 
165
396
  ### UPDATE SOME THINGS ON SINGLE PASS:
@@ -168,14 +399,15 @@ class SRF
168
399
  @index.each_with_index do |ind,i|
169
400
  mass_measured = @dta_files[i][0]
170
401
  #puts @out_files[i].join(", ")
171
- pep_hits = @out_files[i][3]
402
+ @out_files[i][0,3] = *ind
403
+ pep_hits = @out_files[i][6]
172
404
  peps.push( *pep_hits )
173
405
  pep_hits.each do |pep_hit|
174
- pep_hit[13,3] = @base_name, *ind
406
+ pep_hit[14,4] = @base_name, *ind
175
407
  # add the deltamass
176
- pep_hit[10] = pep_hit[0] - mass_measured # real - measured (deltamass)
177
- pep_hit[11] = 1.0e6 * pep_hit[10].abs / mass_measured ## ppm
178
- pep_hit[17] = self ## link with the srf object
408
+ pep_hit[11] = pep_hit[0] - mass_measured # real - measured (deltamass)
409
+ pep_hit[12] = 1.0e6 * pep_hit[11].abs / mass_measured ## ppm
410
+ pep_hit[18] = self ## link with the srf object
179
411
  end
180
412
  end
181
413
  self
@@ -195,27 +427,17 @@ class SRF
195
427
  index
196
428
  end
197
429
 
198
- # given a zero indexed list where each entry is [first_scan, last_scan,
199
- # dta_filename] updates the out info
200
- # returns self
201
- def update_out_scan_info_from_dta_log(dta_log)
202
- index = DTALog.dta_and_scans_by_dta_index(dta_log)
203
- @out_files.each_with_index do |ot,i|
204
- ot[4,3] = index[i] #contingent on implementation of ot
205
- end
206
- self
207
- end
208
-
209
430
  # returns an array of dta_files
210
- def read_dta_files(fh, num_files)
431
+ def read_dta_files(fh, num_files, unpack_35)
211
432
  measured_mhs = Array.new(num_files) ## A parallel array to capture the actual mh
212
433
  dta_files = Array.new(num_files)
213
434
  start = dta_start_byte
214
435
  unless fh.pos == start
215
436
  fh.pos = start
216
437
  end
438
+
217
439
  header.num_dta_files.times do |i|
218
- dta_file = SRF::DTA.new.from_handle(fh)
440
+ dta_file = SRF::DTA.new.from_handle(fh, unpack_35)
219
441
  measured_mhs[i] = dta_file[0]
220
442
  dta_files[i] = dta_file
221
443
  end
@@ -224,10 +446,10 @@ class SRF
224
446
 
225
447
  # filehandle (fh) must be at the start of the outfiles. 'read_dta_files'
226
448
  # will put the fh there.
227
- def read_out_files(fh,number_files, global_ref_hash, measured_mhs)
449
+ def read_out_files(fh,number_files, global_ref_hash, measured_mhs, unpack_35)
228
450
  out_files = Array.new(number_files)
229
451
  header.num_dta_files.times do |i|
230
- out_files[i] = SRF::OUT.new.from_handle(fh, global_ref_hash)
452
+ out_files[i] = SRF::OUT.new.from_handle(fh, global_ref_hash, unpack_35)
231
453
  end
232
454
  out_files
233
455
  end
@@ -326,31 +548,44 @@ class SRF::DTAGen
326
548
  end
327
549
  end
328
550
 
551
+ # total_num_possible_charge_states is not correct under 3.5 (Bioworks 3.3.1)
552
+ # unknown is, well unknown...
329
553
  SRF::DTA = ArrayClass.new(%w(mh dta_tic num_peaks charge ms_level unknown total_num_possible_charge_states peaks))
330
554
 
331
555
  class SRF::DTA
332
- Unpack = "EeIvvvv"
333
-
334
- # is this universal?
335
- First_record_start_byte = 3644
556
+ # original
557
+ # Unpack = "EeIvvvv"
558
+ Unpack_32 = "EeIvvvv"
559
+ Unpack_35 = "Ex8eVx2vvvv"
336
560
 
337
561
  # note on peaks (self[7])
338
562
  # this is a byte array of floats, you can get the peaks out with
339
563
  # unpack("e*")
340
564
 
565
+ undef_method :inspect
341
566
  def inspect
342
567
  peaks_st = 'nil'
343
568
  if self[7] ; peaks_st = "[#{self[7].size} bytes]" end
344
569
  "<SRF::DTA @mh=#{mh} @dta_tic=#{dta_tic} @num_peaks=#{num_peaks} @charge=#{charge} @ms_level=#{ms_level} @total_num_possible_charge_states=#{total_num_possible_charge_states} @peaks=#{peaks_st} >"
345
570
  end
346
571
 
347
- def from_handle(fh)
348
- st = fh.read(24)
572
+ def from_handle(fh, unpack_35)
573
+ if unpack_35
574
+ @unpack = Unpack_35
575
+ @read_header = 34
576
+ @read_spacer = 22
577
+ else
578
+ @unpack = Unpack_32
579
+ @read_header = 24
580
+ @read_spacer = 24
581
+ end
582
+
583
+ st = fh.read(@read_header)
349
584
  # get the bulk of the data in single unpack
350
- self[0,7] = st.unpack(Unpack)
585
+ self[0,7] = st.unpack(@unpack)
351
586
 
352
- # Scan numbers possibly hidden in this next sequence of bytes (I think)
353
- st2 = fh.read(24)
587
+ # Scan numbers are given at the end in an index!
588
+ st2 = fh.read(@read_spacer)
354
589
 
355
590
  num_bytes_to_read = num_peaks * 8
356
591
  st3 = fh.read(num_bytes_to_read)
@@ -360,135 +595,176 @@ class SRF::DTA
360
595
 
361
596
  end
362
597
 
363
- SRF::OUT = ArrayClass.new( %w(num_hits computer date_time hits) )
364
- # 0=num_hits 1=charge 2=computer 3=date_time 4=hits
598
+ SRF::OUT = ArrayClass.new( %w(first_scan last_scan charge num_hits computer date_time hits total_inten lowest_sp num_matched_peptides db_locus_count) )
599
+ # 0=first_scan, 1=last_scan, 2=charge, 3=num_hits, 4=computer, 5=date_time, 6=hits, 7=total_inten, 8=lowest_sp, 9=num_matched_peptides, 10=db_locus_count
365
600
 
366
601
  class SRF::OUT
367
- Unpack = '@36vx2Z*@60Z*'
602
+ Unpack_32 = '@36vx2Z*@60Z*'
603
+ Unpack_35 = '@36vx4Z*@62Z*'
368
604
 
605
+ undef_method :inspect
369
606
  def inspect
370
- if first_scan
371
- ins = "@first_scan=#{first_scan}, @last_scan=#{last_scan}, @filename_noext=#{filename_noext}, "
372
- end
373
- "<SRF::OUT @num_hits=#{num_hits}, @computer=#{computer}, @date_time=#{date_time}, #{ins}@hits=#{hits.inspect}>"
607
+ hits_s =
608
+ if self[6]
609
+ ", @hits(#)=#{hits.size}"
610
+ else
611
+ ''
612
+ end
613
+ "<SRF::OUT first_scan=#{first_scan}, last_scan=#{last_scan}, charge=#{charge}, num_hits=#{num_hits}, computer=#{computer}, date_time=#{date_time}#{hits_s}>"
374
614
  end
375
615
 
376
- def from_handle(fh, global_ref_hash)
616
+ def from_handle(fh, global_ref_hash, unpack_35)
377
617
  ## EMPTY out file is 96 bytes
378
618
  ## each hit is 320 bytes
379
619
  ## num_hits and charge:
380
620
  st = fh.read(96)
381
- self[0,3] = st.unpack(Unpack)
382
- num_hits = self[0]
621
+
622
+ self[3,3] = st.unpack( (unpack_35 ? Unpack_35 : Unpack_32) )
623
+ self[7,4] = st.unpack('@8eex4Ix4I')
624
+ num_hits = self[3]
383
625
 
384
626
  ar = Array.new(num_hits)
385
627
  if ar.size > 0
628
+ num_extra_references = 0
386
629
  num_hits.times do |i|
387
- ar[i] = SRF::OUT::Pep.new.from_handle(fh, global_ref_hash)
630
+ ar[i] = SRF::OUT::Pep.new.from_handle(fh, global_ref_hash, unpack_35)
631
+ num_extra_references += ar[i].num_other_loci
388
632
  end
633
+ SRF::OUT::Pep.read_extra_references(fh, num_extra_references, ar, global_ref_hash)
389
634
  ## The xcorrs are already ordered by best to worst hit
390
635
  ## ADJUST the deltacn's to be meaningful for the top hit:
391
636
  ## (the same as bioworks and prophet)
392
- (1...ar.size).each {|i| ar[i-1].deltacn = ar[i].deltacn }
393
- ar.last.deltacn = 1.1
637
+ SRF::OUT::Pep.set_deltacn_from_deltacn_orig(ar)
638
+ #puts ar.map {|a| a.deltacn }.join(", ")
394
639
  end
395
- self[3] = ar
396
-
640
+ self[6] = ar
397
641
  self
398
642
  end
399
643
 
644
+
645
+
400
646
  end
401
647
 
648
+
402
649
  # deltacn is modified to be that of the next best hit (by xcorr).
650
+ # deltacn_orig is the one that sequest originally reports
403
651
  # if there is no next best hit, then it will be 1.1 (like bioworks)
404
652
  # mh is the theoretical mass + h
405
653
  # prots are created as SRF prot objects with a reference and linked to their
406
654
  # peptides (from global hash by reference)
407
655
  # ppm = 10^6 * ∆m_accuracy / mass_measured [ where ∆m_accuracy = mass_real – mass_measured ]
408
656
  # This is calculated for the M+H mass!
657
+ # num_other_loci is the number of other loci that the peptide matches beyond
658
+ # the first one listed
409
659
  # srf = the srf object this scan came from
410
- SRF::OUT::Pep = ArrayClass.new(%w( mh deltacn sp xcorr id rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf) )
411
660
 
412
- # 0=mh 1=deltacn 2=sp 3=xcorr 4=id 5=rsp 6=ions_matched 7=ions_total 8=sequence 9=prots 10=deltamass 11=ppm 12=aaseq 13=base_name 14=first_scan 15=last_scan 16=charge 17=srf
661
+ SRF::OUT::Pep = ArrayClass.new(%w( mh deltacn_orig sp xcorr id num_other_loci rsp ions_matched ions_total sequence prots deltamass ppm aaseq base_name first_scan last_scan charge srf deltacn) )
662
+
663
+ # 0=mh 1=deltacn_orig 2=sp 3=xcorr 4=id 5=num_other_loci 6=rsp 7=ions_matched 8=ions_total 9=sequence 10=prots 11=deltamass 12=ppm 13=aaseq 14=base_name 15=first_scan 16=last_scan 17=charge 18=srf 19=deltacn
413
664
 
414
665
  class SRF::OUT::Pep
415
666
  include SpecID::Pep
416
667
 
417
- Unpack = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
668
+ # creates the deltacn that is meaningful for the top hit (the deltacn_orig
669
+ # of the second best hit and so on).
670
+ # assumes sorted
671
+ def self.set_deltacn_from_deltacn_orig(ar)
672
+ (1...ar.size).each {|i| ar[i-1].deltacn = ar[i].deltacn_orig }
673
+ ar[-1].deltacn = 1.1
674
+ end
675
+
676
+ # same as set_deltacn_from_deltacn_orig except calculates with xcorr.
677
+ # assumes sorted
678
+ def self.set_deltacn_from_xcorr(ar)
679
+ if ar.size > 0
680
+ top_score = ar.first[3]
681
+ other_scores = (1...(ar.size)).to_a.map do |i|
682
+ (top_score - ar[i][3])/top_score
683
+ end
684
+ (0...(ar.size-1)).each do |i|
685
+ ar[i][19] = other_scores[i]
686
+ end
687
+ ar.last[19] = 1.1
688
+ end
689
+ end
690
+
691
+ def self.read_extra_references(fh, num_extra_references, pep_hits, global_ref_hash)
692
+ num_extra_references.times do
693
+ # 80 bytes total (with index number)
694
+ pep = pep_hits[fh.read(8).unpack('x4I').first - 1]
695
+
696
+ ref = fh.read(80).unpack('A*').first
697
+ pep[10] << pep.new_protein(ref[0,38], pep, global_ref_hash)
698
+ end
699
+ # fh.read(6) if unpack_35
700
+ end
701
+
702
+ # x2=???
703
+ #Unpack_35 = '@64Ex8ex12eeIx22vx2vvx8Z*@246Z*'
704
+ ### NOTE:
705
+ # I need to verify that this is correct (I mean the 'I' after x18)
706
+ Unpack_35 = '@64Ex8ex12eeIx18Ivx2vvx8Z*@246Z*'
707
+ # translation: @64=(64 bytes in to the record), E=mH, x8=8unknown bytes, e=deltacn,
708
+ # x12=12unknown bytes, e=sp, e=xcorr, I=ID#, x18=18 unknown bytes, v=rsp,
709
+ # v=ions_matched, v=ions_total, x8=8unknown bytes, Z*=sequence, @240Z*=at
710
+ # byte 240 grab the string (which is proteins).
711
+ #Unpack_32 = '@64Ex8ex12eeIx18vvvx8Z*@240Z*'
712
+ Unpack_32 = '@64Ex8ex12eeIx14Ivvvx8Z*@240Z*'
418
713
  Unpack_four_null_bytes = 'a*'
419
714
  Unpack_Zstar = 'Z*'
715
+ Read_35 = 426
716
+ Read_32 = 320
420
717
 
421
718
  FourNullBytes_as_string = "\0\0\0\0"
422
719
  #NewRecordStart = "\0\0" + 0x3a.chr + 0x1a.chr + "\0\0"
423
720
  NewRecordStart = 0x01.chr + 0x00.chr
424
721
  Sequest_record_start = "[SEQUEST]"
425
722
 
426
- tmp = $VERBOSE ; $VERBOSE = nil
427
- def prots() self[9] end
428
- $VERBOSE = tmp
429
-
723
+ undef_method :inspect
430
724
  def inspect
431
- st = %w(aaseq sequence mh deltacn sp xcorr id rsp ions_matched ions_total prots deltamass ppm base_name first_scan last_scan charge).map do |v|
432
- if v.is_a? Array
725
+ st = %w(aaseq sequence mh deltacn_orig sp xcorr id rsp ions_matched ions_total prots deltamass ppm base_name first_scan last_scan charge deltacn).map do |v|
726
+ if v == 'prots'
727
+ "#{v}(#)=#{send(v.to_sym).size}"
728
+ elsif v.is_a? Array
433
729
  "##{v}=#{send(v.to_sym).size}"
434
730
  else
435
- "@#{v}=#{send(v.to_sym)}"
731
+ "#{v}=#{send(v.to_sym).inspect}"
436
732
  end
437
733
  end
438
734
  st.unshift("<#{self.class}")
439
735
  if srf
440
- st.push("@srf(base_name)=#{srf.base_name}")
736
+ st.push("srf(base_name)=#{srf.base_name.inspect}")
441
737
  end
442
738
  st.push('>')
443
739
  st.join(' ')
444
740
  #"<SRF::OUT::Pep @mh=#{mh}, @deltacn=#{deltacn}, @sp=#{sp}, @xcorr=#{xcorr}, @id=#{id}, @rsp=#{rsp}, @ions_matched=#{ions_matched}, @ions_total=#{ions_total}, @sequence=#{sequence}, @prots(count)=#{prots.size}, @deltamass=#{deltamass}, @ppm=#{ppm} @aaseq=#{aaseq}, @base_name=#{base_name}, @first_scan=#{first_scan}, @last_scan=#{last_scan}, @charge=#{charge}, @srf(base_name)=#{srf.base_name}>"
445
741
  end
742
+ # extra_references_array is an array that grows with peptides as extra
743
+ # references are discovered.
744
+ def from_handle(fh, global_ref_hash, unpack_35)
745
+ unpack =
746
+ if unpack_35 ; Unpack_35
747
+ else ; Unpack_32
748
+ end
446
749
 
447
- ## There must be a better way to do this.
448
- ## We are checking that there are no additional protein references only
449
- ## so that we are in register for the next reading
450
- def read_extra_references(fh, global_ref_hash)
451
- $SRF_OUT_HIT_FH_POS = fh.pos
452
- st = fh.read(4)
453
- #puts "HHH: " + st.unpack("H*").first
454
- ## if we see 0000 0000 we are done
455
- if st.unpack(Unpack_four_null_bytes).first == FourNullBytes_as_string
456
- fh.pos = $SRF_OUT_HIT_FH_POS
457
- return nil
458
- end
459
- # read in context of 4 bytes read above:
460
-
461
- ## NOTE: in context of 4 bytes read above!
462
- st = fh.read(36)
463
- if st[34,2] == NewRecordStart
464
- fh.pos = $SRF_OUT_HIT_FH_POS
465
- return nil
466
- end
750
+ ## get the first part of the info
751
+ st = fh.read(( unpack_35 ? Read_35 : Read_32) ) ## read all the hit data
467
752
 
468
- # is this the end of the outfiles?
469
- ## BACK to beginning of this section
470
- fh.pos = $SRF_OUT_HIT_FH_POS
471
- if fh.read(9) == Sequest_record_start
472
- fh.pos = $SRF_OUT_HIT_FH_POS
473
- return
474
- end
753
+ self[0,10] = st.unpack(unpack)
475
754
 
476
- ## we have extra references
477
- ## original read was fh.read(79)
478
- fh.seek(-1, IO::SEEK_CUR)
479
- self[9].push( new_protein(fh.read(80).unpack(Unpack_Zstar).first, self, global_ref_hash ) )
755
+ # we are slicing the reference to 38 chars to be the same length as
756
+ # duplicate references
757
+ self[10] = [new_protein(self[10][0,38], self, global_ref_hash)]
480
758
 
481
- #p self.prots
482
- #puts self.prots.size
483
- #$glob ||= 0
484
- #$glob += 1
485
- #if $glob == 20
486
- # abort
487
- #end
488
-
489
- read_extra_references(fh,global_ref_hash)
759
+ self[13] = SpecID::Pep.sequence_to_aaseq(self[9])
760
+
761
+ fh.read(6) if unpack_35
762
+
763
+ self
490
764
  end
491
765
 
766
+
767
+
492
768
  def new_protein(reference, peptide, global_ref_hash)
493
769
  if global_ref_hash.key? reference
494
770
  global_ref_hash[reference].peps << peptide
@@ -498,29 +774,20 @@ class SRF::OUT::Pep
498
774
  global_ref_hash[reference]
499
775
  end
500
776
 
501
- def from_handle(fh, global_ref_hash)
502
- ## get the first part of the info
503
- st = fh.read(320) ## read all the hit data
504
- self[0,10] = st.unpack(Unpack)
505
- # we are slicing the reference to 38 chars to be the same length as
506
- # duplicate references
507
- self[9] = [new_protein(self[9][0,38], self, global_ref_hash)]
508
- self[12] = SpecID::Pep.sequence_to_aaseq(self[8])
509
- read_extra_references(fh, global_ref_hash)
510
-
511
- self
512
- end
513
-
514
- end
777
+ end
515
778
 
516
779
  SRF::OUT::Prot = ArrayClass.new( %w(reference peps) )
517
780
 
518
781
  class SRF::OUT::Prot
519
782
  include SpecID::Prot
783
+ # we shouldn't have to do this because this is included in SpecID::Prot, but
784
+ # under some circumstances it won't work without explicitly calling it.
785
+ include ProteinReferenceable
520
786
 
521
787
  tmp = $VERBOSE ; $VERBOSE = nil
522
788
  def initialize(reference=nil, peps=[])
523
- super(@@arr_size)
789
+ #super(@@arr_size)
790
+ super(size)
524
791
  #@reference = reference
525
792
  #@peps = peps
526
793
  self[0,2] = reference, peps
@@ -529,6 +796,7 @@ class SRF::OUT::Prot
529
796
 
530
797
  # "<SRF::OUT::Prot reference=\"#{@reference}\">"
531
798
 
799
+ undef_method :inspect
532
800
  def inspect
533
801
  "<SRF::OUT::Prot @reference=#{reference}, @peps(#)=#{peps.size}>"
534
802
  end