mspire 0.4.9 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (255) hide show
  1. data/README +27 -17
  2. data/changelog.txt +31 -62
  3. data/lib/ms/calc.rb +32 -0
  4. data/lib/ms/data/interleaved.rb +60 -0
  5. data/lib/ms/data/lazy_io.rb +73 -0
  6. data/lib/ms/data/lazy_string.rb +15 -0
  7. data/lib/ms/data/simple.rb +59 -0
  8. data/lib/ms/data/transposed.rb +41 -0
  9. data/lib/ms/data.rb +57 -0
  10. data/lib/ms/format/format_error.rb +12 -0
  11. data/lib/ms/spectrum.rb +25 -384
  12. data/lib/ms/support/binary_search.rb +126 -0
  13. data/lib/ms.rb +10 -10
  14. metadata +38 -350
  15. data/INSTALL +0 -58
  16. data/README.rdoc +0 -18
  17. data/Rakefile +0 -330
  18. data/bin/aafreqs.rb +0 -23
  19. data/bin/bioworks2excel.rb +0 -14
  20. data/bin/bioworks_to_pepxml.rb +0 -148
  21. data/bin/bioworks_to_pepxml_gui.rb +0 -225
  22. data/bin/fasta_shaker.rb +0 -5
  23. data/bin/filter_and_validate.rb +0 -5
  24. data/bin/gi2annot.rb +0 -14
  25. data/bin/id_class_anal.rb +0 -112
  26. data/bin/id_precision.rb +0 -172
  27. data/bin/ms_to_lmat.rb +0 -67
  28. data/bin/pepproph_filter.rb +0 -16
  29. data/bin/prob_validate.rb +0 -6
  30. data/bin/protein_summary.rb +0 -6
  31. data/bin/protxml2prots_peps.rb +0 -32
  32. data/bin/raw_to_mzXML.rb +0 -55
  33. data/bin/run_percolator.rb +0 -122
  34. data/bin/sqt_group.rb +0 -26
  35. data/bin/srf_group.rb +0 -27
  36. data/bin/srf_to_sqt.rb +0 -40
  37. data/lib/align/chams.rb +0 -78
  38. data/lib/align.rb +0 -154
  39. data/lib/archive/targz.rb +0 -94
  40. data/lib/bsearch.rb +0 -120
  41. data/lib/core_extensions.rb +0 -16
  42. data/lib/fasta.rb +0 -626
  43. data/lib/gi.rb +0 -124
  44. data/lib/group_by.rb +0 -10
  45. data/lib/index_by.rb +0 -11
  46. data/lib/merge_deep.rb +0 -21
  47. data/lib/ms/converter/mzxml.rb +0 -77
  48. data/lib/ms/gradient_program.rb +0 -170
  49. data/lib/ms/msrun.rb +0 -244
  50. data/lib/ms/msrun_index.rb +0 -108
  51. data/lib/ms/parser/mzdata/axml.rb +0 -67
  52. data/lib/ms/parser/mzdata/dom.rb +0 -175
  53. data/lib/ms/parser/mzdata/libxml.rb +0 -7
  54. data/lib/ms/parser/mzdata.rb +0 -31
  55. data/lib/ms/parser/mzxml/axml.rb +0 -70
  56. data/lib/ms/parser/mzxml/dom.rb +0 -182
  57. data/lib/ms/parser/mzxml/hpricot.rb +0 -253
  58. data/lib/ms/parser/mzxml/libxml.rb +0 -19
  59. data/lib/ms/parser/mzxml/regexp.rb +0 -122
  60. data/lib/ms/parser/mzxml/rexml.rb +0 -72
  61. data/lib/ms/parser/mzxml/xmlparser.rb +0 -248
  62. data/lib/ms/parser/mzxml.rb +0 -282
  63. data/lib/ms/parser.rb +0 -108
  64. data/lib/ms/precursor.rb +0 -25
  65. data/lib/ms/scan.rb +0 -81
  66. data/lib/mspire.rb +0 -4
  67. data/lib/pi_zero.rb +0 -244
  68. data/lib/qvalue.rb +0 -161
  69. data/lib/roc.rb +0 -187
  70. data/lib/sample_enzyme.rb +0 -160
  71. data/lib/scan_i.rb +0 -21
  72. data/lib/spec_id/aa_freqs.rb +0 -170
  73. data/lib/spec_id/bioworks.rb +0 -497
  74. data/lib/spec_id/digestor.rb +0 -138
  75. data/lib/spec_id/mass.rb +0 -179
  76. data/lib/spec_id/parser/proph.rb +0 -335
  77. data/lib/spec_id/precision/filter/cmdline.rb +0 -218
  78. data/lib/spec_id/precision/filter/interactive.rb +0 -134
  79. data/lib/spec_id/precision/filter/output.rb +0 -148
  80. data/lib/spec_id/precision/filter.rb +0 -637
  81. data/lib/spec_id/precision/output.rb +0 -60
  82. data/lib/spec_id/precision/prob/cmdline.rb +0 -160
  83. data/lib/spec_id/precision/prob/output.rb +0 -94
  84. data/lib/spec_id/precision/prob.rb +0 -249
  85. data/lib/spec_id/proph/pep_summary.rb +0 -104
  86. data/lib/spec_id/proph/prot_summary.rb +0 -484
  87. data/lib/spec_id/proph.rb +0 -4
  88. data/lib/spec_id/protein_summary.rb +0 -489
  89. data/lib/spec_id/sequest/params.rb +0 -316
  90. data/lib/spec_id/sequest/pepxml.rb +0 -1458
  91. data/lib/spec_id/sequest.rb +0 -33
  92. data/lib/spec_id/sqt.rb +0 -349
  93. data/lib/spec_id/srf.rb +0 -973
  94. data/lib/spec_id.rb +0 -778
  95. data/lib/spec_id_xml.rb +0 -99
  96. data/lib/transmem/phobius.rb +0 -147
  97. data/lib/transmem/toppred.rb +0 -368
  98. data/lib/transmem.rb +0 -157
  99. data/lib/validator/aa.rb +0 -48
  100. data/lib/validator/aa_est.rb +0 -112
  101. data/lib/validator/background.rb +0 -77
  102. data/lib/validator/bias.rb +0 -95
  103. data/lib/validator/cmdline.rb +0 -431
  104. data/lib/validator/decoy.rb +0 -107
  105. data/lib/validator/digestion_based.rb +0 -70
  106. data/lib/validator/probability.rb +0 -51
  107. data/lib/validator/prot_from_pep.rb +0 -234
  108. data/lib/validator/q_value.rb +0 -32
  109. data/lib/validator/transmem.rb +0 -272
  110. data/lib/validator/true_pos.rb +0 -46
  111. data/lib/validator.rb +0 -197
  112. data/lib/xml.rb +0 -38
  113. data/lib/xml_style_parser.rb +0 -119
  114. data/lib/xmlparser_wrapper.rb +0 -19
  115. data/release_notes.txt +0 -2
  116. data/script/compile_and_plot_smriti_final.rb +0 -97
  117. data/script/create_little_pepxml.rb +0 -61
  118. data/script/degenerate_peptides.rb +0 -47
  119. data/script/estimate_fpr_by_cysteine.rb +0 -226
  120. data/script/extract_gradient_programs.rb +0 -56
  121. data/script/find_cysteine_background.rb +0 -137
  122. data/script/genuine_tps_and_probs.rb +0 -136
  123. data/script/get_apex_values_rexml.rb +0 -44
  124. data/script/histogram_probs.rb +0 -61
  125. data/script/mascot_fix_pepxml.rb +0 -123
  126. data/script/msvis.rb +0 -42
  127. data/script/mzXML2timeIndex.rb +0 -25
  128. data/script/peps_per_bin.rb +0 -67
  129. data/script/prep_dir.rb +0 -121
  130. data/script/simple_protein_digestion.rb +0 -27
  131. data/script/smriti_final_analysis.rb +0 -103
  132. data/script/sqt_to_meta.rb +0 -24
  133. data/script/top_hit_per_scan.rb +0 -67
  134. data/script/toppred_to_yaml.rb +0 -47
  135. data/script/tpp_installer.rb +0 -249
  136. data/specs/align_spec.rb +0 -79
  137. data/specs/bin/bioworks_to_pepxml_spec.rb +0 -79
  138. data/specs/bin/fasta_shaker_spec.rb +0 -259
  139. data/specs/bin/filter_and_validate__multiple_vals_helper.yaml +0 -199
  140. data/specs/bin/filter_and_validate_spec.rb +0 -180
  141. data/specs/bin/ms_to_lmat_spec.rb +0 -34
  142. data/specs/bin/prob_validate_spec.rb +0 -86
  143. data/specs/bin/protein_summary_spec.rb +0 -14
  144. data/specs/fasta_spec.rb +0 -354
  145. data/specs/gi_spec.rb +0 -22
  146. data/specs/load_bin_path.rb +0 -7
  147. data/specs/merge_deep_spec.rb +0 -13
  148. data/specs/ms/gradient_program_spec.rb +0 -77
  149. data/specs/ms/msrun_spec.rb +0 -498
  150. data/specs/ms/parser_spec.rb +0 -92
  151. data/specs/ms/spectrum_spec.rb +0 -87
  152. data/specs/pi_zero_spec.rb +0 -115
  153. data/specs/qvalue_spec.rb +0 -39
  154. data/specs/roc_spec.rb +0 -251
  155. data/specs/rspec_autotest.rb +0 -149
  156. data/specs/sample_enzyme_spec.rb +0 -126
  157. data/specs/spec_helper.rb +0 -135
  158. data/specs/spec_id/aa_freqs_spec.rb +0 -52
  159. data/specs/spec_id/bioworks_spec.rb +0 -148
  160. data/specs/spec_id/digestor_spec.rb +0 -75
  161. data/specs/spec_id/precision/filter/cmdline_spec.rb +0 -20
  162. data/specs/spec_id/precision/filter/output_spec.rb +0 -31
  163. data/specs/spec_id/precision/filter_spec.rb +0 -246
  164. data/specs/spec_id/precision/prob_spec.rb +0 -44
  165. data/specs/spec_id/precision/prob_spec_helper.rb +0 -0
  166. data/specs/spec_id/proph/pep_summary_spec.rb +0 -98
  167. data/specs/spec_id/proph/prot_summary_spec.rb +0 -128
  168. data/specs/spec_id/protein_summary_spec.rb +0 -189
  169. data/specs/spec_id/sequest/params_spec.rb +0 -68
  170. data/specs/spec_id/sequest/pepxml_spec.rb +0 -374
  171. data/specs/spec_id/sequest_spec.rb +0 -38
  172. data/specs/spec_id/sqt_spec.rb +0 -246
  173. data/specs/spec_id/srf_spec.rb +0 -172
  174. data/specs/spec_id/srf_spec_helper.rb +0 -139
  175. data/specs/spec_id_helper.rb +0 -33
  176. data/specs/spec_id_spec.rb +0 -366
  177. data/specs/spec_id_xml_spec.rb +0 -33
  178. data/specs/transmem/phobius_spec.rb +0 -425
  179. data/specs/transmem/toppred_spec.rb +0 -298
  180. data/specs/transmem_spec.rb +0 -60
  181. data/specs/transmem_spec_shared.rb +0 -64
  182. data/specs/validator/aa_est_spec.rb +0 -66
  183. data/specs/validator/aa_spec.rb +0 -40
  184. data/specs/validator/background_spec.rb +0 -67
  185. data/specs/validator/bias_spec.rb +0 -122
  186. data/specs/validator/decoy_spec.rb +0 -51
  187. data/specs/validator/fasta_helper.rb +0 -26
  188. data/specs/validator/prot_from_pep_spec.rb +0 -141
  189. data/specs/validator/transmem_spec.rb +0 -146
  190. data/specs/validator/true_pos_spec.rb +0 -58
  191. data/specs/validator_helper.rb +0 -33
  192. data/specs/xml_spec.rb +0 -12
  193. data/test_files/000_pepxml18_small.xml +0 -206
  194. data/test_files/020a.mzXML.timeIndex +0 -4710
  195. data/test_files/4-03-03_mzXML/000.mzXML.timeIndex +0 -3973
  196. data/test_files/4-03-03_mzXML/020.mzXML.timeIndex +0 -3872
  197. data/test_files/4-03-03_small-prot.xml +0 -321
  198. data/test_files/4-03-03_small.xml +0 -3876
  199. data/test_files/7MIX_STD_110802_1.sequest_params_fragment.srf +0 -0
  200. data/test_files/bioworks-3.3_10prots.xml +0 -5999
  201. data/test_files/bioworks31.params +0 -77
  202. data/test_files/bioworks32.params +0 -62
  203. data/test_files/bioworks33.params +0 -63
  204. data/test_files/bioworks_single_run_small.xml +0 -7237
  205. data/test_files/bioworks_small.fasta +0 -212
  206. data/test_files/bioworks_small.params +0 -63
  207. data/test_files/bioworks_small.phobius +0 -109
  208. data/test_files/bioworks_small.toppred.out +0 -2847
  209. data/test_files/bioworks_small.xml +0 -5610
  210. data/test_files/bioworks_with_INV_small.xml +0 -3753
  211. data/test_files/bioworks_with_SHUFF_small.xml +0 -2503
  212. data/test_files/corrupted_900.srf +0 -0
  213. data/test_files/head_of_7MIX.srf +0 -0
  214. data/test_files/interact-opd1_mods_small-prot.xml +0 -304
  215. data/test_files/messups.fasta +0 -297
  216. data/test_files/opd1/000.my_answer.100lines.xml +0 -101
  217. data/test_files/opd1/000.tpp_1.2.3.first10.xml +0 -115
  218. data/test_files/opd1/000.tpp_2.9.2.first10.xml +0 -126
  219. data/test_files/opd1/000.v2.1.mzXML.timeIndex +0 -3748
  220. data/test_files/opd1/000_020-prot.png +0 -0
  221. data/test_files/opd1/000_020_3prots-prot.mod_initprob.xml +0 -62
  222. data/test_files/opd1/000_020_3prots-prot.xml +0 -62
  223. data/test_files/opd1/opd1_cat_inv_small-prot.xml +0 -139
  224. data/test_files/opd1/sequest.3.1.params +0 -77
  225. data/test_files/opd1/sequest.3.2.params +0 -62
  226. data/test_files/opd1/twenty_scans.mzXML +0 -418
  227. data/test_files/opd1/twenty_scans.v2.1.mzXML +0 -382
  228. data/test_files/opd1/twenty_scans_answ.lmat +0 -0
  229. data/test_files/opd1/twenty_scans_answ.lmata +0 -9
  230. data/test_files/opd1_020_beginning.RAW +0 -0
  231. data/test_files/opd1_2runs_2mods/data/020.mzData.xml +0 -683
  232. data/test_files/opd1_2runs_2mods/data/020.readw.mzXML +0 -382
  233. data/test_files/opd1_2runs_2mods/data/040.mzData.xml +0 -683
  234. data/test_files/opd1_2runs_2mods/data/040.readw.mzXML +0 -382
  235. data/test_files/opd1_2runs_2mods/data/README.txt +0 -6
  236. data/test_files/opd1_2runs_2mods/interact-opd1_mods__small.xml +0 -753
  237. data/test_files/orbitrap_mzData/000_cut.xml +0 -1920
  238. data/test_files/pepproph_small.xml +0 -4691
  239. data/test_files/phobius.small.noheader.txt +0 -50
  240. data/test_files/phobius.small.small.txt +0 -53
  241. data/test_files/s01_anC1_ld020mM.key.txt +0 -25
  242. data/test_files/s01_anC1_ld020mM.meth +0 -0
  243. data/test_files/small.fasta +0 -297
  244. data/test_files/small.sqt +0 -87
  245. data/test_files/smallraw.RAW +0 -0
  246. data/test_files/tf_bioworks2excel.bioXML +0 -14340
  247. data/test_files/tf_bioworks2excel.txt.actual +0 -1035
  248. data/test_files/toppred.small.out +0 -416
  249. data/test_files/toppred.xml.out +0 -318
  250. data/test_files/validator_hits_separate/bias_bioworks_small_HS.fasta +0 -7
  251. data/test_files/validator_hits_separate/bioworks_small_HS.xml +0 -5651
  252. data/test_files/yeast_gly_small-prot.xml +0 -265
  253. data/test_files/yeast_gly_small.1.0_1.0_1.0.parentTimes +0 -6
  254. data/test_files/yeast_gly_small.xml +0 -3807
  255. data/test_files/yeast_gly_small2.parentTimes +0 -6
data/lib/transmem.rb DELETED
@@ -1,157 +0,0 @@
1
-
2
- # A transmemIndex is a hash that takes a fasta reference as key and returns
3
- # a structured hash containing the transmembrane information.
4
- module TransmemIndex
5
-
6
- # returns :toppred or :phobius
7
- def self.filetype(file)
8
- tp = nil
9
- File.open(file) do |fh|
10
- while (line = fh.gets)
11
- case line
12
- when /SEQENCE/
13
- tp = :phobius
14
- break
15
- when / 0 0 i/
16
- tp = :phobius # if they don't have the headers,
17
- # this will pick it up if they have a
18
- # single prot without tm or signal peptide.
19
- break
20
- when /Algorithm specific parameters/
21
- tp = :toppred # New text
22
- break
23
- when /<parameters>/
24
- tp = :toppred # XML
25
- break
26
- end
27
- end
28
- end
29
- tp
30
- end
31
-
32
- def reference_to_key(reference)
33
- # needs to be subclassed or written
34
- end
35
-
36
- # right now accepts toppred.out files
37
- # Phobius objects can use the fasta object to update their hash for methods
38
- # like avg_overlap
39
- def self.new(file, fasta=nil)
40
- case x = filetype(file)
41
- when :toppred
42
- require 'transmem/toppred'
43
- TopPred::Index.new(file)
44
- when :phobius
45
- require 'transmem/phobius'
46
- # warn "WARNING: You have NO fasta object with Phobius based TransmemIndex! (which needs one to do proper indexing!)" unless fasta
47
- Phobius::Index.new(file, fasta)
48
- else
49
- raise ArgumentError, "#{x} filetype for #{file} not recognized!"
50
- end
51
- end
52
-
53
- # returns a hash of key -> num certain transmembrane segments
54
- def num_certain_index
55
- hash = {}
56
- self.each do |k,v|
57
- hash[k] = v[:num_certain_transmembrane_segments] || 0
58
- end
59
- hash
60
- end
61
-
62
- # tp = :number or :fraction which is the fraction of the sequence size
63
- # returns the average number of overlapping amino acids with transmembrane
64
- # segments
65
- # returns nil if there is no protein by that key
66
- def avg_overlap(key, sequence, tp=:number)
67
- if self.key? key
68
- numbers = num_transmem_aa(self[key], sequence)
69
- if numbers.size > 0
70
- sum = 0
71
- numbers.each {|num| sum += num}
72
- avg_num = sum.to_f / numbers.size
73
- # the one line way to do it
74
- #avg_num = numbers.inject(0) {|memo,num| num + memo }.to_f / numbers.size
75
- if tp == :fraction
76
- avg_num / sequence.size
77
- # this is the same as doing this:
78
- #numbers.inject(0.0) {|memo,num| (num.to_f/seq_size + memo) } / numbers.size
79
- else
80
- avg_num
81
- end
82
- else
83
- 0.0
84
- end
85
- else # what to do if the protein isn't there?? which happens on occasion
86
- nil
87
- end
88
- end
89
-
90
- # returns an array (usually length of 1) of the number of amino acids
91
- # contained inside transmembrane spanning segments.
92
- # assumes that tmhash has the key 'transmembrane_segments'
93
- # if there are no transmembrane segments, returns empty array.
94
- def num_transmem_aa(tmhash, sequence)
95
- if tmhash.key? :transmembrane_segments
96
- ranges = tmhash[:transmembrane_segments].map do |tmseg|
97
- Range.new( tmseg[:start]-1, tmseg[:stop]-1 )
98
- end
99
- num_overlapping_chars(tmhash[:aaseq], ranges, sequence)
100
- else
101
- []
102
- end
103
- end
104
-
105
- # returns an array of the number of overlapping sequences in substring with
106
- # the substrings defined in start_stop_doublets within full_sequence
107
- # start_stop_doublets should be 0 indexed!!!
108
- # the span includes the 'stop' position i.e., full_sequence[start..stop]
109
- def num_overlapping_chars(full_sequence, ranges, substring)
110
- #start_positions = aaseq.enum_for(:scan, substring).map { $~.offset(0)[0]}
111
- if ranges.size == 0
112
- []
113
- #full_sequence.enum_for(:scan, substring).map { 0 }
114
- else
115
- substring_ranges = []
116
- pos = 0
117
- slen = substring.size
118
- while i=full_sequence.index(substring,pos)
119
- substring_ranges << Range.new(i, i+slen-1)
120
- pos = i + slen
121
- end
122
- # brute force way
123
- last_tm_range = ranges.last.last
124
- to_return = substring_ranges.map do |sb|
125
- overlap = 0
126
- # there's got to be a much simpler way to do this, but this does work...
127
- ranges.each do |tm|
128
- (frst, lst) =
129
- if tm.include?( sb.first )
130
- [tm, sb]
131
- elsif tm.include?( sb.last )
132
- [sb, tm]
133
- else
134
- nil
135
- end
136
- if frst
137
- if lst.last <= frst.last
138
- overlap += (frst.last+1 - frst.first) - (lst.first - frst.first) - (frst.last - lst.last)
139
- else
140
- overlap += (frst.last+1 - frst.first) - (lst.first - frst.first)
141
- end
142
- end
143
- end
144
- overlap
145
- end
146
- end
147
- end
148
-
149
-
150
- end
151
-
152
-
153
- #substring_ranges = full_sequence.enum_for(:scan, substring).map do
154
- # (ofirst, olast) = $~.offset(0)
155
- # Range.new(ofirst, olast - 1)
156
- # end
157
-
data/lib/validator/aa.rb DELETED
@@ -1,48 +0,0 @@
1
- require 'validator/digestion_based'
2
- require 'fasta'
3
- require 'spec_id/aa_freqs'
4
-
5
- # Constraints on aaseq attribute of peptides (the bare amino acid sequence)
6
- # works by calculating amino acid frequencies in the fasta file used.
7
- class Validator::AA < Validator::DigestionBased
8
- include Precision::Calculator
9
-
10
- attr_accessor :constraint
11
-
12
- # it is a false hit if the amino acid is located in the peptide
13
- attr_accessor :false_if_found
14
-
15
- DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
16
- :false_if_found => true,
17
- } )
18
-
19
- # returns tp, fp
20
- def partition(peps)
21
- (found, not_found) = peps.partition do |pep|
22
- pep.aaseq.include?(@constraint)
23
- end
24
- if @false_if_found
25
- [not_found, found]
26
- else
27
- [found, not_found]
28
- end
29
- end
30
-
31
- # right now only accepts single amino acids as constraints (as a string,
32
- # e.g. 'C', or symbol, e.g. :C)
33
- # options:
34
- # :false_to_total_ratio => if a true digestion was already performed (see
35
- # Validator::AA.calc_false_to_total_ratio)
36
- # :false_if_found => it is a false positive if the amino acid is found.
37
- # :background => the background level of amino acid Float
38
- def initialize(constraint, options={})
39
- @constraint = constraint.to_s
40
- opts = DEFAULTS.merge(options)
41
- (@false_to_total_ratio, @false_if_found, @background) = opts.values_at(:false_to_total_ratio, :false_if_found, :background)
42
- end
43
-
44
- def to_param_string
45
- "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "false_to_total_ratio=#{@false_to_total_ratio}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
46
- end
47
- end
48
-
@@ -1,112 +0,0 @@
1
- require 'validator/aa'
2
-
3
-
4
- class Validator ; end
5
- class Validator::AA ; end
6
-
7
- # A class that uses the peps given to it and a background frequency to
8
- # calculate the false_to_total_ratio at each turn.
9
- class Validator::AAEst < Validator::AA
10
- attr_accessor :constraint
11
- attr_accessor :false_if_found
12
-
13
- # the frequency of the amino acid is used to estimate the false to
14
- # total ratio based on the pephits given for pephit_precision.
15
- # see Validator::AA.calc_frequency to calculate a frequency
16
- # or use set_frequency to set from pep hits.
17
- attr_accessor :frequency
18
-
19
- DEFAULTS = {
20
- :false_if_found => true
21
- }.merge(Validator::DigestionBased::DEFAULTS) # background 0.0
22
-
23
- # only takes a string right now for constraint
24
- def initialize(constraint, options={})
25
- @constraint = constraint.to_s
26
- opts = DEFAULTS.merge(options)
27
- (@frequency, @false_if_found, @background) = opts.values_at(:frequency, :false_if_found, :background)
28
- end
29
-
30
- def pephit_precision(peps)
31
- set_false_to_total_ratio(peps)
32
- super(peps)
33
- end
34
-
35
- def set_false_to_total_ratio(peps)
36
- if peps.size > 0
37
- expected = 0.0
38
- peps.each do |pep|
39
- expected += (1.0 - ((1.0 - @frequency)**pep.aaseq.size))
40
- end
41
- @false_to_total_ratio = expected / peps.size
42
- else
43
- @false_to_total_ratio = 1.0
44
- end
45
- end
46
-
47
- def set_ongoing_false_to_total_ratio(peps)
48
- if peps.size > 0
49
- peps.each do |pep|
50
- @expected += (1.0 - ((1.0-@frequency)**pep.aaseq.size))
51
- end
52
- # @increment_total_submitted should == @increment_tps and @increment_fps
53
- # since these are either/or
54
- @false_to_total_ratio = @expected / @increment_total_submitted
55
- else
56
- @false_to_total_ratio = 1.0
57
- end
58
- end
59
-
60
-
61
- def to_param_string
62
- "aminoacid(bad_aa)=" + ["{constraint=#{@constraint}", "frequency=#{@frequency}", "bkg=#{(@background ? @background : 0.0) }}"].join(", ")
63
- end
64
-
65
- # takes objects responding to aaseq and sets the frequency based on
66
- # constraint. constraint is one acceptable to initialize! returns self
67
- def set_frequency(objs)
68
- table = SpecID::AAFreqs.new.calculate_frequencies(objs)
69
- @frequency = table[@constraint.to_sym]
70
- self
71
- end
72
-
73
- # if adding pephits in groups at a time, the entire group does not need to be
74
- # queried, just the individual hit. Use this OR pephits_precision (NOT
75
- # both). The initial query to this method will begin a running tally that
76
- # is saved by the validator.
77
- # takes either an array or a single pephit (determined by if it is a
78
- # SpecID::Pep)
79
- def increment_pephits_precision(peps)
80
- tmp = $VERBOSE; $VERBOSE = nil
81
- unless @increment_initialized
82
- initialize_increment
83
- @expected = 0.0
84
- end
85
- $VERBOSE = tmp
86
-
87
- to_submit =
88
- if peps.is_a? SpecID::Pep
89
- [peps]
90
- else
91
- peps
92
- end
93
- @increment_total_submitted += to_submit.size
94
- (tps, fps) = partition(to_submit)
95
- #### THIS IS THE MAGIC FOR THIS VALIDATOR:
96
- set_ongoing_false_to_total_ratio(to_submit)
97
-
98
- @increment_tps += tps.size
99
- @increment_fps += fps.size
100
- (num_tps, num_fps) =
101
- if self.respond_to?(:calc_precision_prep) # for digestion based validators
102
- (num_tps, num_fps) = calc_precision_prep(@increment_tps, @increment_fps)
103
- [num_tps, num_fps]
104
- else
105
- [@increment_tps, @increment_fps]
106
- end
107
- calc_precision(num_tps, num_fps)
108
- end
109
-
110
-
111
-
112
- end
@@ -1,77 +0,0 @@
1
- require 'validator'
2
- require 'vec'
3
- require 'enumerator'
4
-
5
- class Validator ; end
6
- class Validator::Background
7
-
8
- attr_accessor :data
9
-
10
- def initialize(data=nil)
11
- @data = data
12
- end
13
-
14
- def delete_nan!(vec)
15
- vec.each_with_index do |v,i|
16
- if v.nan?
17
- vec[i] = 0
18
- end
19
- end
20
- end
21
-
22
- def stdev_plus_spread(stdev_factor=2.0, stdev_points=15, min_window_pre=5, min_window_post=5)
23
- data_vec = VecD[*@data]
24
- delete_nan!(data_vec)
25
- stdev_transform = data_vec.transform(9) {|vec| (stdev_factor * vec.sample_stats[1]) + vec.spread }
26
- smoothed_stdev = stdev_transform.transform(9) {|vec| vec.avg }
27
- smoothed_stdev_derivs = smoothed_stdev.chim
28
- last_0_index = index_of_last_0(smoothed_stdev_derivs)
29
- min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
30
- end
31
-
32
- def plot(vec)
33
- `graph #{vec.join(" ")} -a -T X`
34
- end
35
-
36
- # not really working right currently
37
- def derivs(avg_points=15, min_window_pre=5, min_window_post=5)
38
- data_vec = VecD[*@data]
39
- delete_nan!(data_vec)
40
- drvs = data_vec.chim
41
- # absolute value
42
- drvs.each_with_index {|x,i| drvs[i] = x.abs }
43
- mv_avg = drvs.transform(avg_points) {|v| v.avg }
44
- last_0_index = index_of_last_0(mv_avg.chim)
45
- min_in_window(data_vec, last_0_index, min_window_pre, min_window_post)
46
- end
47
-
48
- def index_of_last_0(vec)
49
- last_0_index = nil
50
- vec.each_with_index do |v,i|
51
- if v == 0
52
- last_0_index = i
53
- end
54
- end
55
- last_0_index
56
- end
57
-
58
- # returns the minimum value in the window centered on index
59
- def min_in_window(vec, index, pre, post)
60
- last_index = vec.size - 1
61
- start = index - pre
62
- stop = index + post
63
- start = 0 if start < 0
64
- stop = last_index if stop > last_index
65
- vec[start..stop].min
66
- end
67
-
68
- # very simple, should work
69
- def min_mesa(start, stop, points=3)
70
- data_vec = VecD[*@data]
71
- delete_nan!(data_vec)
72
- smoothed = data_vec.transform(3) {|v| v.avg }
73
- smoothed[start..stop].min
74
- end
75
-
76
- end
77
-
@@ -1,95 +0,0 @@
1
- require 'validator'
2
- require 'validator/digestion_based'
3
-
4
- # class for any generic kind of bias. For instance, a list of high abundance
5
- # proteins we would expect to see, or a list of low abundance proteins we
6
- # would not expect to see, or proteins that have been filtered out in some
7
- # way, etc.
8
- class Validator::Bias < Validator::DigestionBased
9
- include Precision::Calculator
10
-
11
- # a fasta object (by default containing proteins expected to be in the
12
- # sample [see proteins_expected to modify that behavior])
13
- attr_reader :fasta
14
-
15
- # correct_wins means that only a single protein from a pep.aaseq must match
16
- # the fasta object for the pep hit to be considered valid. Otherwise, all
17
- # must be a match (logic negated by proteins_expected)
18
- attr_accessor :correct_wins
19
-
20
- # proteins_expected==true means we expect to see the proteins in the sample
21
- # proteins_expected==false means we do not expect to see these proteins in
22
- # the sample
23
- attr_accessor :proteins_expected
24
-
25
- # a hash made by taking each fasta reference in fasta_object, (everything
26
- # until a space) and setting the value to true. It can be queried with the
27
- # start of an fasta sequence
28
- attr_accessor :short_reference_hash
29
-
30
- DEFAULTS = Validator::DigestionBased::DEFAULTS.merge( {
31
- :proteins_expected => true,
32
- :correct_wins => true,
33
- } )
34
-
35
- # options:
36
- # (t = true, f = false, '*'= default)
37
- # :proteins_expected => *t/f we expect to see the fasta proteins in our hit list
38
- # :correct_wins => *t/f a single peptide hit from one of these proteins
39
- # constitutes a true positive
40
- # :background => Float (*0.0-1.0)
41
- # :false_to_total_ratio => Float (*nil by default)
42
- def initialize(fasta_object, options={})
43
- opts = DEFAULTS.merge(options)
44
- (@proteins_expected, @correct_wins, @background, @false_to_total_ratio) = opts.values_at(:proteins_expected, :correct_wins, :background, :false_to_total_ratio)
45
- @fasta = fasta_object
46
- @header_split_hash = @fasta.prots.map {|prot| prot.reference }
47
- @short_reference_hash = self.class.make_short_reference_hash(fasta_object)
48
- end
49
-
50
- def self.make_short_reference_hash(fasta_object)
51
- hash = {}
52
- fasta_object.each do |prot|
53
- hash[prot.first_entry] = true
54
- end
55
- hash
56
- end
57
-
58
- def partition(peps)
59
- klass = self.class
60
- cw =
61
- if !@proteins_expected
62
- !@correct_wins
63
- else
64
- @correct_wins
65
- end
66
-
67
- (tp, fp) =
68
- if cw
69
- peps.partition do |pep|
70
- pep.prots.any? do |pepprot|
71
- @short_reference_hash.key?( pepprot.first_entry )
72
- end
73
- end
74
- else
75
- peps.partition do |pep|
76
- pep.prots.any? do |pepprot|
77
- !@short_reference_hash.key?( pepprot.first_entry )
78
- end
79
- end
80
- end
81
-
82
- if !@correct_wins
83
- tp, fp = fp, tp
84
- end
85
-
86
- [tp, fp]
87
- end
88
-
89
- # pephit_precision is done through inheritance
90
-
91
- def to_param_string
92
- "abundance=" + ["{fasta=#{@fasta.filename}", "proteins_expected=#{@proteins_expected}", "correct_wins=#{@correct_wins}", "background=#{@background}}"].join(", ")
93
- end
94
-
95
- end