mspire 0.4.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/transmem.rb DELETED
@@ -1,157 +0,0 @@
1
-
2
# A transmemIndex is a hash that takes a fasta reference as key and returns
# a structured hash containing the transmembrane information.
#
# The instance methods below call #each, #key? and #[] on self, so this module
# is expected to be mixed into (or to extend) a Hash-like object.
module TransmemIndex

  # Scans +file+ line by line and returns the format of the first line that
  # matches a known signature: :toppred or :phobius.
  # Returns nil when no line matches.
  def self.filetype(file)
    File.open(file) do |fh|
      while (line = fh.gets)
        case line
        when /SEQENCE/
          # NOTE(review): the odd spelling is intentional here; presumably it is
          # the literal header of Phobius short output -- confirm before "fixing".
          return :phobius
        when / 0 0 i/
          # if they don't have the headers, this will pick it up if they have
          # a single prot without tm or signal peptide.
          return :phobius
        when /Algorithm specific parameters/
          return :toppred # New text
        when /<parameters>/
          return :toppred # XML
        end
      end
    end
    nil
  end

  # Maps a fasta reference to the key used by the index.
  # needs to be subclassed or written (this default returns nil)
  def reference_to_key(reference)
  end

  # Factory: inspects +file+ with TransmemIndex.filetype and builds the matching
  # index object (the implementation file is required lazily).
  #
  # file::  path to a toppred or phobius output file
  # fasta:: optional fasta object, handed to Phobius::Index only
  #
  # Phobius objects can use the fasta object to update their hash for methods
  # like avg_overlap
  #
  # Raises ArgumentError when the file type is not recognized.
  def self.new(file, fasta=nil)
    case x = filetype(file)
    when :toppred
      require 'transmem/toppred'
      TopPred::Index.new(file)
    when :phobius
      require 'transmem/phobius'
      Phobius::Index.new(file, fasta)
    else
      raise ArgumentError, "#{x} filetype for #{file} not recognized!"
    end
  end

  # returns a hash of key -> num certain transmembrane segments
  # (0 when an entry has no :num_certain_transmembrane_segments value)
  def num_certain_index
    hash = {}
    self.each do |k,v|
      hash[k] = v[:num_certain_transmembrane_segments] || 0
    end
    hash
  end

  # Returns the average number of amino acids of +sequence+ that overlap with
  # transmembrane segments of the protein stored under +key+ (averaged over
  # every occurrence of +sequence+ in the protein).
  #
  # tp:: :number (default) returns the average count; :fraction divides that
  #      average by sequence.size
  #
  # returns 0.0 if there are no occurrences to average
  # returns nil if there is no protein by that key
  def avg_overlap(key, sequence, tp=:number)
    # what to do if the protein isn't there?? which happens on occasion
    return nil unless self.key?(key)

    numbers = num_transmem_aa(self[key], sequence)
    return 0.0 if numbers.size == 0

    avg_num = numbers.inject(0) {|memo, num| memo + num }.to_f / numbers.size
    if tp == :fraction
      avg_num / sequence.size
    else
      avg_num
    end
  end

  # returns an array (usually length of 1) of the number of amino acids
  # contained inside transmembrane spanning segments.
  # reads tmhash[:transmembrane_segments] (hashes with 1-indexed :start and
  # :stop) and tmhash[:aaseq]
  # if there are no transmembrane segments, returns empty array.
  def num_transmem_aa(tmhash, sequence)
    return [] unless tmhash.key?(:transmembrane_segments)

    # convert the 1-indexed, inclusive segments into 0-indexed ranges
    ranges = tmhash[:transmembrane_segments].map do |tmseg|
      Range.new( tmseg[:start]-1, tmseg[:stop]-1 )
    end
    num_overlapping_chars(tmhash[:aaseq], ranges, sequence)
  end

  # For each non-overlapping occurrence of +substring+ in +full_sequence+
  # (searched left to right), returns the number of positions it shares with
  # the given +ranges+, summed over all ranges.
  #
  # ranges should be 0 indexed!!!
  # each range includes its last position i.e., full_sequence[first..last]
  #
  # Returns [] when +ranges+ is empty or +substring+ is empty (an empty
  # substring matches everywhere without advancing and would never terminate).
  def num_overlapping_chars(full_sequence, ranges, substring)
    return [] if ranges.size == 0
    slen = substring.size
    return [] if slen == 0

    substring_ranges = []
    pos = 0
    while (i = full_sequence.index(substring, pos))
      substring_ranges << Range.new(i, i+slen-1)
      pos = i + slen
    end

    substring_ranges.map do |sb|
      ranges.inject(0) do |overlap, tm|
        # plain interval intersection: this also counts a segment that lies
        # strictly inside the occurrence, not only partial overlaps
        shared_first = [tm.first, sb.first].max
        shared_last  = [tm.last, sb.last].min
        if shared_last >= shared_first
          overlap + (shared_last - shared_first + 1)
        else
          overlap
        end
      end
    end
  end

end
151
-
152
-
153
- #substring_ranges = full_sequence.enum_for(:scan, substring).map do
154
- # (ofirst, olast) = $~.offset(0)
155
- # Range.new(ofirst, olast - 1)
156
- # end
157
-
data/lib/validator/aa.rb DELETED
@@ -1,48 +0,0 @@
1
- require 'validator/digestion_based'
2
- require 'fasta'
3
- require 'spec_id/aa_freqs'
4
-
5
# Validator built on a constraint over the aaseq attribute of peptides (the
# bare amino acid sequence): a peptide is classified by whether its aaseq
# contains the constraint string.
# NOTE(review): the original header says this works by calculating amino acid
# frequencies in the fasta file used; that calculation is not in this class
# body (presumably inherited from Validator::DigestionBased) -- confirm.
class Validator::AA < Validator::DigestionBased
  include Precision::Calculator

  # the constraint, stored as a String
  attr_accessor :constraint

  # when true, a peptide containing the constraint is a false hit
  attr_accessor :false_if_found

  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( { :false_if_found => true } )

  # Splits +peps+ (objects responding to aaseq) by presence of the constraint.
  # returns [tp, fp]; which group is which depends on @false_if_found
  def partition(peps)
    with_aa, without_aa = peps.partition {|pep| pep.aaseq.include?(@constraint) }
    @false_if_found ? [without_aa, with_aa] : [with_aa, without_aa]
  end

  # right now only accepts single amino acids as constraints (as a string,
  # e.g. 'C', or symbol, e.g. :C); the value is converted with to_s
  # options (merged over DEFAULTS):
  #   :false_to_total_ratio => if a true digestion was already performed (see
  #                            Validator::AA.calc_false_to_total_ratio)
  #   :false_if_found => it is a false positive if the amino acid is found.
  #   :background => the background level of amino acid Float
  def initialize(constraint, options={})
    @constraint = constraint.to_s
    settings = DEFAULTS.merge(options)
    @false_to_total_ratio = settings[:false_to_total_ratio]
    @false_if_found = settings[:false_if_found]
    @background = settings[:background]
  end

  # One-line description of this validator's settings; a missing background is
  # printed as 0.0
  def to_param_string
    bkg = @background || 0.0
    "aminoacid(bad_aa)=" + "{constraint=#{@constraint}, false_to_total_ratio=#{@false_to_total_ratio}, bkg=#{bkg}}"
  end
end
48
-
@@ -1,112 +0,0 @@
1
- require 'validator/aa'
2
-
3
-
4
- class Validator ; end
5
- class Validator::AA ; end
6
-
7
# A class that uses the peps given to it and a background frequency to
# calculate the false_to_total_ratio at each turn.
class Validator::AAEst < Validator::AA
  attr_accessor :constraint
  attr_accessor :false_if_found

  # the frequency of the amino acid is used to estimate the false to
  # total ratio based on the pephits given for pephit_precision.
  # see Validator::AA.calc_frequency to calculate a frequency
  # or use set_frequency to set from pep hits.
  attr_accessor :frequency

  DEFAULTS = {
    :false_if_found => true
  }.merge(Validator::DigestionBased::DEFAULTS) # background 0.0

  # only takes a string right now for constraint (converted with to_s)
  # options (merged over DEFAULTS): :frequency, :false_if_found, :background
  def initialize(constraint, options={})
    @constraint = constraint.to_s
    opts = DEFAULTS.merge(options)
    (@frequency, @false_if_found, @background) = opts.values_at(:frequency, :false_if_found, :background)
  end

  # Re-estimates @false_to_total_ratio from +peps+, then defers to the
  # inherited pephit_precision.
  def pephit_precision(peps)
    set_false_to_total_ratio(peps)
    super(peps)
  end

  # Sets @false_to_total_ratio to the mean, over +peps+, of
  # 1 - (1 - frequency)**(aaseq length); 1.0 when +peps+ is empty.
  # Requires @frequency to be set (see set_frequency).
  def set_false_to_total_ratio(peps)
    @false_to_total_ratio =
      if peps.size > 0
        expected = peps.inject(0.0) {|sum, pep| sum + constraint_probability(pep) }
        expected / peps.size
      else
        1.0
      end
  end

  # Running-tally variant: adds the terms for +peps+ to @expected and divides
  # by @increment_total_submitted. Leaves @expected untouched and sets the
  # ratio to 1.0 when +peps+ is empty.
  def set_ongoing_false_to_total_ratio(peps)
    if peps.size > 0
      peps.each do |pep|
        @expected += constraint_probability(pep)
      end
      # @increment_total_submitted should == @increment_tps and @increment_fps
      # since these are either/or
      @false_to_total_ratio = @expected / @increment_total_submitted
    else
      @false_to_total_ratio = 1.0
    end
  end

  # One-line description of this validator's settings; a missing background is
  # printed as 0.0
  def to_param_string
    "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
  end

  # takes objects responding to aaseq and sets the frequency based on
  # constraint. constraint is one acceptable to initialize! returns self
  def set_frequency(objs)
    table = SpecID::AAFreqs.new.calculate_frequencies(objs)
    @frequency = table[@constraint.to_sym]
    self
  end

  # if adding pephits in groups at a time, the entire group does not need to be
  # queried, just the individual hit. Use this OR pephits_precision (NOT
  # both). The initial query to this method will begin a running tally that
  # is saved by the validator.
  # takes either an array or a single pephit (determined by if it is a
  # SpecID::Pep)
  def increment_pephits_precision(peps)
    # Warnings are silenced around the first-use check (presumably to hide the
    # "instance variable not initialized" warning -- confirm). The ensure
    # guarantees the global is restored even if initialize_increment raises.
    saved_verbose = $VERBOSE
    $VERBOSE = nil
    begin
      unless @increment_initialized
        initialize_increment
        @expected = 0.0
      end
    ensure
      $VERBOSE = saved_verbose
    end

    to_submit =
      if peps.is_a? SpecID::Pep
        [peps]
      else
        peps
      end
    @increment_total_submitted += to_submit.size
    (tps, fps) = partition(to_submit)
    #### THIS IS THE MAGIC FOR THIS VALIDATOR:
    set_ongoing_false_to_total_ratio(to_submit)

    @increment_tps += tps.size
    @increment_fps += fps.size
    (num_tps, num_fps) =
      if self.respond_to?(:calc_precision_prep) # for digestion based validators
        calc_precision_prep(@increment_tps, @increment_fps)
      else
        [@increment_tps, @increment_fps]
      end
    calc_precision(num_tps, num_fps)
  end

  private

  # Term contributed by one peptide to the expected count:
  # 1 - (1 - frequency)**(aaseq length)
  def constraint_probability(pep)
    1.0 - ((1.0 - @frequency)**pep.aaseq.size)
  end

end
@@ -1,77 +0,0 @@
1
- require 'validator'
2
- require 'vec'
3
- require 'enumerator'
4
-
5
class Validator ; end

# Helpers for picking a background value out of a numeric data series.
# The series is wrapped in a VecD (from the 'vec' library) before use; NaN
# entries are replaced by 0 first.
class Validator::Background

  # the raw data series
  attr_accessor :data

  def initialize(data=nil)
    @data = data
  end

  # Replaces every NaN entry of +vec+ with 0, in place (despite the name,
  # nothing is removed, so indices are preserved).
  def delete_nan!(vec)
    vec.each_with_index do |v,i|
      if v.nan?
        vec[i] = 0
      end
    end
  end

  # Builds a (stdev_factor * sample_stats[1]) + spread series over a 9-point
  # window, smooths it with a 9-point average, takes chim of that
  # (presumably the derivative -- confirm against the vec library), and
  # returns the minimum of the data in the window around the last index where
  # that result is 0.
  #
  # NOTE(review): stdev_points is accepted but not used; both windows are
  # fixed at 9. index_of_last_0 returns nil when no entry is 0, which makes
  # min_in_window fail -- confirm the intended handling.
  def stdev_plus_spread(stdev_factor=2.0, stdev_points=15, min_window_pre=5, min_window_post=5)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    stdev_transform = data_vec.transform(9) {|vec| (stdev_factor * vec.sample_stats[1]) + vec.spread }
    smoothed_stdev = stdev_transform.transform(9) {|vec| vec.avg }
    smoothed_stdev_derivs = smoothed_stdev.chim
    last_0_index = index_of_last_0(smoothed_stdev_derivs)
    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
  end

  # Hands the values of +vec+ to the external `graph` command.
  def plot(vec)
    `graph #{vec.join(" ")} -a -T X`
  end

  # not really working right currently
  #
  # Takes chim of the data, its absolute values, a moving average over
  # avg_points, then returns the minimum of the data in the window around the
  # last index where chim of that moving average is 0.
  def derivs(avg_points=15, min_window_pre=5, min_window_post=5)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    drvs = data_vec.chim
    # absolute value
    drvs.each_with_index {|x,i| drvs[i] = x.abs }
    mv_avg = drvs.transform(avg_points) {|v| v.avg }
    last_0_index = index_of_last_0(mv_avg.chim)
    min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
  end

  # returns the index of the last entry equal to 0, or nil if there is none
  def index_of_last_0(vec)
    last_0_index = nil
    vec.each_with_index do |v,i|
      last_0_index = i if v == 0
    end
    last_0_index
  end

  # returns the minimum value in the window index-pre..index+post, clamped to
  # the bounds of vec
  def min_in_window(vec, index, pre, post)
    last_index = vec.size - 1
    start = index - pre
    stop = index + post
    start = 0 if start < 0
    stop = last_index if stop > last_index
    vec[start..stop].min
  end

  # very simple, should work
  #
  # Smooths the data with a moving average over +points+ entries and returns
  # the minimum of the smoothed values in start..stop.
  def min_mesa(start, stop, points=3)
    data_vec = VecD[*@data]
    delete_nan!(data_vec)
    # honour the caller's window size (previously fixed at 3, the default)
    smoothed = data_vec.transform(points) {|v| v.avg }
    smoothed[start..stop].min
  end

end
77
-
@@ -1,95 +0,0 @@
1
- require 'validator'
2
- require 'validator/digestion_based'
3
-
4
# Validator for any generic kind of bias, expressed as a list of proteins:
# for instance high abundance proteins we would expect to see, low abundance
# proteins we would not expect to see, or proteins that have been filtered out
# in some way, etc.
class Validator::Bias < Validator::DigestionBased
  include Precision::Calculator

  # a fasta object (by default containing proteins expected to be in the
  # sample [see proteins_expected to modify that behavior])
  attr_reader :fasta

  # correct_wins means that only a single protein from a pep.aaseq must match
  # the fasta object for the pep hit to be considered valid. Otherwise, all
  # must be a match (logic negated by proteins_expected)
  attr_accessor :correct_wins

  # proteins_expected==true means we expect to see the proteins in the sample
  # proteins_expected==false means we do not expect to see these proteins in
  # the sample
  attr_accessor :proteins_expected

  # a hash with one key per protein of the fasta object (its first_entry),
  # each mapped to true
  attr_accessor :short_reference_hash

  DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
    :proteins_expected => true,
    :correct_wins => true,
  } )

  # options:
  #   (t = true, f = false, '*'= default)
  #   :proteins_expected => *t/f we expect to see the fasta proteins in our hit list
  #   :correct_wins => *t/f a single peptide hit from one of these proteins
  #                    constitutes a true positive
  #   :background => Float (*0.0-1.0)
  #   :false_to_total_ratio => Float (*nil by default)
  def initialize(fasta_object, options={})
    settings = DEFAULTS.merge(options)
    @proteins_expected = settings[:proteins_expected]
    @correct_wins = settings[:correct_wins]
    @background = settings[:background]
    @false_to_total_ratio = settings[:false_to_total_ratio]
    @fasta = fasta_object
    # despite the name this is an Array of references; it is not read anywhere
    # in this class
    @header_split_hash = @fasta.prots.map {|prot| prot.reference }
    @short_reference_hash = self.class.make_short_reference_hash(fasta_object)
  end

  # Builds { prot.first_entry => true } for every protein yielded by
  # fasta_object.each
  def self.make_short_reference_hash(fasta_object)
    lookup = {}
    fasta_object.each {|prot| lookup[prot.first_entry] = true }
    lookup
  end

  # Splits +peps+ into [tp, fp] by looking up pepprot.first_entry of each of a
  # peptide's prots in @short_reference_hash.
  def partition(peps)
    # proteins_expected == false flips the meaning of correct_wins
    want_listed = @proteins_expected ? !!@correct_wins : !@correct_wins

    # a peptide lands in the first group as soon as one of its proteins is
    # listed (want_listed) or is unlisted (otherwise)
    hits, misses = peps.partition do |pep|
      pep.prots.any? do |pepprot|
        @short_reference_hash.key?( pepprot.first_entry ) == want_listed
      end
    end

    @correct_wins ? [hits, misses] : [misses, hits]
  end

  # pephit_precision is done through inheritance

  # One-line description of this validator's settings
  def to_param_string
    "abundance=" + "{fasta=#{@fasta.filename}, proteins_expected=#{@proteins_expected}, correct_wins=#{@correct_wins}, background=#{@background}}"
  end

end