rbbt-dm 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZGQzNzI5ZWRiMzBkMTRjNGIxZmUxYTEwMDAyZDEyZjA1NTI0NTI1NQ==
5
+ data.tar.gz: !binary |-
6
+ YTQwYjdlNzU0NGY4ODUzMTBjNDgzMjcxMTAwOWY1OTY2OGM2YjY0Ng==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZDFkMTZiMjMxYWE2NjYxZDFlYzFmNjZmMmQzZDhlNWQ5YzhlYzlhNDEzMmI0
10
+ OGIzYzJkNGFkMjYyMjdjYjVmMjNiNTk4ZTdhYTQzOGJkNjM2MjEwZWJhOTgz
11
+ YWViMDA3NTY2MDE2NTM0YjRhOWMwYjk2Y2RlMGMxMzU4MjQ2YjE=
12
+ data.tar.gz: !binary |-
13
+ MjhmNjc2ODQ5MTZhOWZjNmE0YTQxYzJkN2UzZWQyYjNhZmZkMGJjNjliYTc2
14
+ NjQwOTk0YjZiMjRmNjFkZjQzZGM3OTAwYmYxZWJhOTM1MTI5ZmNmNzVlNTE3
15
+ MTE4YThkZmE2ZjVkMjViNzY5ZWI0NmZiMmYwYTY4Y2Y2NDFjMzU=
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010-2011 Miguel Vázquez García
1
+ Copyright (c) 2010-2013 Miguel Vázquez García
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
@@ -0,0 +1,99 @@
1
+ require 'rbbt'
2
+ require 'rbbt/tsv'
3
+ require 'rbbt/GE'
4
+
5
# Helpers for loading expression matrices from TSV files and deriving
# averaged, differential and barcode tables from them.
module Expression
  extend Workflow

  # Opens +data_file+ as a :double TSV and, when an identifier file and a
  # target +identifier_format+ different from the current key field are
  # given, re-keys the matrix by that format and averages multiple values.
  #
  # data_file         - path of the expression values TSV.
  # identifier_file   - TSV that may provide +identifier_format+ (may be nil).
  # identifier_format - name of the field to use as the new key (may be nil).
  # organism          - organism code; defaults to the namespace of the data.
  #
  # Returns the TSV as opened when no translation applies, otherwise a new
  # :list TSV cast with :to_f holding one mean per field.
  # Raises RuntimeError when the identifier file lacks the format and no
  # organism is available to look it up.
  def self.load_matrix(data_file, identifier_file, identifier_format, organism)
    log :open_data, "Opening data file"
    data = TSV.open(data_file, :type => :double, :unnamed => true)

    organism ||= data.namespace

    no_translation = identifier_file.nil? || identifier_format.nil? || data.key_field == identifier_format
    if no_translation
      log :ready, "Matrix ready"
      return data
    end

    # Prefer the supplied identifier file when it carries the wanted format;
    # otherwise fall back to the organism identifier table.
    header_fields = TSV.parse_header(Open.open(identifier_file)).fields
    if header_fields && header_fields.include?(identifier_format)
      source = identifier_file
    else
      raise "No organism defined and identifier_format did not match available formats" if organism.nil?
      require 'rbbt/sources/organism'
      source = Organism.identifiers(organism)
      data.identifiers = identifier_file
    end

    log :attach, "Adding #{ identifier_format } from #{ source }"
    data = data.attach source, :fields => [identifier_format]
    log :reorder, "Reordering data fields"
    remaining_fields = data.fields.dup.delete_if{|field| field == identifier_format}
    data = data.reorder identifier_format, remaining_fields

    averaged = TSV.setup({}, :key_field => data.key_field, :fields => data.fields, :type => :list, :cast => :to_f, :namespace => organism, :unnamed => true)
    log :averaging, "Averaging multiple values"
    data.with_unnamed do
      data.through do |key, values|
        averaged[key] = values.collect{|list| Misc.mean(list.map(&:to_f)) }
      end
    end

    averaged
  end

  # Builds a table with, for every key of the matrix, the mean of the values
  # found in the columns named in +samples+. Samples for which
  # +identify_field+ returns nil are ignored.
  def self.average_samples(matrix_file, samples)
    matrix = TSV.open(matrix_file)
    averages = TSV.setup({}, :key_field => matrix.key_field, :fields => matrix.fields, :cast => matrix.cast, :namespace => matrix.namespace)
    columns = samples.collect{|sample| matrix.identify_field sample }.compact

    matrix.with_unnamed do
      matrix.through do |key, values|
        selected = values.values_at(*columns).compact
        averages[key] = Misc.mean(selected)
      end
    end

    averages
  end

  # Runs GE.analyze comparing the +main+ samples against the +contrast+
  # samples (both restricted to fields present in the matrix header) and
  # returns the result opened as a :list TSV cast with :to_f.
  #
  # NOTE(review): +self+ in this method is the Expression module, so the
  # Step branch presumably only applies when the body is evaluated on a
  # Step; confirm before relying on it.
  def self.differential(matrix_file, main, contrast, log2, two_channel)
    header = TSV.parse_header(Open.open(matrix_file))
    key_field, *fields = header.all_fields
    namespace = header.namespace

    main = main & fields
    contrast = contrast & fields

    analyze_into = lambda do |target|
      GE.analyze(matrix_file, main, contrast, log2, target, key_field, two_channel)
      TSV.open(target, :type => :list, :cast => :to_f, :namespace => namespace)
    end

    if Step === self
      analyze_into.call(path)
    else
      TmpFile.with_file{|tmp_path| analyze_into.call(tmp_path) }
    end
  end

  # Thin wrapper delegating to GE.barcode with the same arguments.
  def self.barcode(matrix_file, output_file, factor = 3)
    GE.barcode(matrix_file, output_file, factor)
  end

  # Entries of +diff_file+ whose "adjusted.p.values" lies strictly between
  # 0 and +cutoff+.
  def self.top_up(diff_file, cutoff = 0.05)
    differences = TSV.open(diff_file, :cast => :to_f)
    differences.select("adjusted.p.values"){|p| p > 0 && p < cutoff }
  end

  # Entries of +diff_file+ whose "adjusted.p.values" lies strictly between
  # -cutoff and 0; every value of the selected entries is replaced by its
  # absolute value.
  def self.top_down(diff_file, cutoff = 0.05)
    lower_bound = -cutoff
    down = TSV.open(diff_file, :cast => :to_f).select("adjusted.p.values"){|p| p < 0 && p > lower_bound }
    down.each do |key, values|
      down[key] = values.map(&:abs)
    end
    down
  end
end
99
+
@@ -0,0 +1,164 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/misc'
3
+ require 'rbbt/persist'
4
+ require 'rbbt/workflow'
5
+ require 'rbbt/GE/GEO'
6
+ require 'rbbt/statistics/fdr'
7
+ require 'rbbt/expression/expression'
8
+ require 'rbbt/expression/signature'
9
+
10
# An expression matrix backed by a data file, together with its identifier
# file, optional sample labels and the settings used when analysing it.
# Derived tables are persisted under Matrix::MATRIX_DIR through Persist and
# the methods that build them return the path of the persisted file.
class Matrix
  extend Resource
  self.subdir = "var/matrices"
  MATRIX_DIR = Matrix.root.find

  # Builds a Matrix for the GEO dataset +gds+ from its values, samples and
  # platform codes resources. +log2+ is set when the dataset 'info.yaml'
  # reports a :value_type of "count".
  def self.geo_matrix_for(gds, key_field = nil, organism = nil)
    data = GEO[gds].values.produce.find
    samples = GEO[gds].samples.produce.find

    dataset_info = GEO[gds]['info.yaml'].produce.yaml
    platform = dataset_info[:platform]
    identifiers = GEO[platform].codes.produce.find

    log2 = ["count"].include? dataset_info[:value_type]

    Matrix.new(data, identifiers, samples, key_field, organism, log2)
  end

  attr_accessor :data, :identifiers, :labels, :key_field, :organism, :samples, :log2, :channel

  # data        - data file (it is produced first when it responds to :produce).
  # identifiers - identifier file handed to Expression.load_matrix.
  # labels      - optional file opened as a TSV describing the samples.
  # key_field   - identifier format to key the matrix by.
  # organism    - organism code.
  # log2        - passed to Expression.differential.
  # channel     - passed to Expression.differential as its two_channel flag.
  def initialize(data, identifiers, labels = nil, key_field = nil, organism = nil, log2 = false, channel = false)
    data.produce if data.respond_to? :produce
    @data = data
    # The sample names are the fields of the data file header
    @samples = TSV::Parser.new(Open.open(data)).fields
    @identifiers = identifiers
    @labels = TSV.open(labels) unless labels.nil?
    @key_field = key_field
    @log2 = log2
    @channel = channel
    @organism = organism
  end

  # Persists the matrix loaded by Expression.load_matrix and returns the path
  # of the persisted file. Matrices keyed by "Ensembl Gene ID" are restricted
  # to the keys listed by Organism.sanctioned_genes for the organism.
  def matrix_file(path = nil)
    path ||= Persist.persistence_path(data, {:dir => Matrix::MATRIX_DIR}, {:identifiers => identifiers, :labels => labels, :key_field => key_field, :organism => organism})
    Persist.persist(data, :tsv, :file => path, :check => [data], :no_load => true) do
      matrix = Expression.load_matrix(data, identifiers, key_field, organism)
      matrix = matrix.select(:key => Organism.sanctioned_genes(organism).list) if matrix.key_field == "Ensembl Gene ID"
      matrix
    end
    path
  end

  # Persists Expression.average_samples for +samples+; returns the path.
  def average_samples(samples)
    path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'averaged_samples')}, {:samples => samples})
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      Expression.average_samples(matrix_file, samples)
    end
    path
  end

  # Keys of the labels table whose selected entry equals +value+ or, for
  # Array entries, contains it once flattened.
  def find_samples(value, field = nil)
    labels.select(field){|k,v|
      Array === v ? v.flatten.include?(value) : v == value
    }.keys
  end

  # Keeps only the samples that are fields of the data file.
  def remove_missing(samples)
    @samples & samples
  end

  # Averages the samples labelled with +value+; returns the path.
  def average_label(value, field = nil)
    samples = find_samples(value, field)
    samples = remove_missing(samples)
    average_samples(samples)
  end

  # Persists the result of Expression.barcode for this matrix and returns
  # the path of the output file.
  #
  # The default path is derived from +factor+ alone. It used to be derived
  # from +main+ and +contrast+, names that do not exist in this method, so
  # calling it without an explicit +path+ raised NameError.
  def barcode(path = nil, factor = 2)
    path ||= Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'barcode')}, {:factor => factor})
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      Expression.barcode(matrix_file, path, factor)
      # Expression.barcode is handed the output path itself; the block
      # yields nothing for Persist to store.
      nil
    end
    path
  end

  # Persists Expression.differential between the +main+ and +contrast+
  # sample lists; returns the path.
  def sample_differences(main, contrast)
    path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'sample_differences')}, {:main => main, :contrast => contrast, :log2 => log2, :channel => channel})
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      Expression.differential(matrix_file, main, contrast, log2, channel)
    end
    path
  end

  # Differential analysis between the samples labelled +main+ and those
  # labelled +contrast+ (all remaining labelled samples when +contrast+ is
  # nil); returns the path.
  def label_differences(main, contrast = nil, field = nil)
    main_samples, contrast_samples = sample_groups(main, contrast, field)
    sample_differences(main_samples, contrast_samples)
  end

  # Persists a table with one column per distinct label value, each taken
  # from column +field+ of that label's differential analysis. Labels whose
  # signature cannot be computed are skipped with a warning. Returns the path.
  def signature_set(field, cast = nil)
    path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'signature_set')}, {:field => field, :cast => cast})
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      signatures = TSV.open(matrix_file, :fields => [], :type => :list, :cast => cast)
      labels.values.flatten.uniq.sort.each do |value|
        begin
          s = Signature.tsv_field(label_differences(value), field, cast)
          s.fields = [value]
          signatures.attach s
        rescue StandardError => e
          # Best effort: one failing label must not abort the whole set, but
          # interrupts and exit requests are no longer swallowed.
          Log.warn("Signature for #{ value } did not compute")
          Log.debug("#{ e.class }: #{ e.message }")
        end
      end
      signatures
    end
    path
  end

  # Runs an R randomForest classifying +main+ against +contrast+ samples over
  # the rows named in the :features option and returns the importance table
  # opened as a :single TSV cast with :to_f.
  #
  # options - Hash; :features lists the rows to use (defaults to []).
  def random_forest_importance(main, contrast = nil, field = nil, options = {})
    features = Misc.process_options options, :features
    features ||= []

    path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'random_forest_importance')}, {:main => main, :contrast => contrast, :field => field, :features => features})
    Persist.persist(data, :tsv, :file => path, :no_load => false, :check => [matrix_file]) do
      main_samples, contrast_samples = sample_groups(main, contrast, field)

      TmpFile.with_file do |result|
        R.run <<-EOF
library(randomForest);
orig = rbbt.tsv('#{matrix_file}');
main = c('#{main_samples * "', '"}')
contrast = c('#{contrast_samples * "', '"}')
features = c('#{features * "', '"}')

features = intersect(features, rownames(orig));
data = t(orig[features, c(main, contrast)])
data = cbind(data, Class = 0)
data[main, "Class"] = 1

rf = randomForest(factor(Class) ~ ., data, na.action = na.exclude)
rbbt.tsv.write(rf$importance, filename='#{ result }', key.field = '#{@key_field}')
        EOF

        TSV.open(result, :type => :single, :cast => :to_f)
      end
    end
  end

  private

  # Resolves the two sample groups of a comparison: the samples labelled
  # +main+ and those labelled +contrast+ (every other labelled sample when
  # +contrast+ is nil), both restricted to samples present in the data file.
  def sample_groups(main, contrast, field)
    all_samples = labels.keys
    main_samples = find_samples(main, field)
    contrast_samples = contrast ? find_samples(contrast, field) : all_samples - main_samples

    [remove_missing(main_samples), remove_missing(contrast_samples)]
  end
end
@@ -0,0 +1,151 @@
1
+ require 'rbbt/util/misc'
2
+
3
# Mixin for hash-like tables that map a key to a single numeric value.
# Several methods rely on the host object also providing +key_field+,
# +namespace+, +entity_options+ and +unnamed=+.
#
# By the convention visible in the methods below, a p-value carries the
# direction of the change in its sign: positive for one direction, negative
# for the other.
module Signature

  # Extends +hash+ with Signature and returns it. +options+ is accepted but
  # not used.
  def self.setup(hash, options = {})
    hash.extend Signature
    hash
  end

  # Opens +file+ as a TSV (defaults: :type => :single, :cast => :to_f),
  # restricted to +field+ when given and no :fields option was passed, and
  # extends it with Signature.
  def self.open(file, field = nil, options = {})
    options = Misc.add_defaults options, :fields => nil, :cast => :to_f, :type => :single

    options[:fields] ||= [field] if field

    tsv = TSV.open(file, options)
    tsv.extend Signature
    tsv
  end

  # Returns column +field+ of +tsv+ (a TSV, or something TSV.open accepts)
  # as a Signature.
  def self.tsv_field(tsv, field, cast = nil)
    tsv = TSV.open(tsv) unless TSV === tsv
    Signature.setup(tsv.column(field, cast))
  end

  # Comparator for signed p-values, shared by +pvalue_sorted+ and
  # +pvalue_sorted_weights+. Positive values sort first, by increasing
  # magnitude; negative values sort last, by decreasing magnitude; zero
  # falls between the two groups.
  def self.compare_pvalues(a, b)
    if a == b
      0
    elsif a <= 0 and b >= 0
      1
    elsif a >= 0 and b <= 0
      -1
    elsif a > 0
      a.abs <=> b.abs
    else
      b.abs <=> a.abs
    end
  end

  #{{{ Basic manipulation

  # Same as the host object's +select+, but the result is a Signature.
  def select(*args, &block)
    Signature.setup(super(*args, &block))
  end

  # Replaces every value, in place, with the result of the block, which
  # receives either (value) or (key, value) depending on its arity.
  # Returns self. Raises RuntimeError when no block is given or its arity
  # is neither 1 nor 2.
  def transform(&block)
    case
    when (block_given? and block.arity == 2)
      self.each do |key, value|
        self[key] = yield key, value
      end
    when (block_given? and block.arity == 1)
      self.each do |key, value|
        self[key] = yield value
      end
    else
      raise "Block not given, or arity not 1 or 2"
    end
    self
  end

  # Replaces every value with its absolute value (in place).
  def abs
    transform{|value| value.abs}
  end

  # Replaces every value with its natural logarithm (in place).
  def log
    transform{|value| Math.log(value)}
  end

  # Keys whose value is >= +threshold+, passed through Misc.prepare_entity.
  def values_over(threshold)
    Misc.prepare_entity(self.select{|k,v| v >= threshold}.collect{|k,v| k}, self.key_field, signature_entity_options)
  end

  # Keys whose value is <= +threshold+, passed through Misc.prepare_entity.
  def values_under(threshold)
    Misc.prepare_entity(self.select{|k,v| v <= threshold}.collect{|k,v| k}, self.key_field, signature_entity_options)
  end

  #{{{ Rank stuff

  # Copy of the signature without nil or empty values, flagged as unnamed.
  def clean_empty
    Signature.setup(select{|k,v| v.nil? ? false : (v.respond_to?(:empty) ? !v.empty? : true)}.tap{|s| s.unnamed = true})
  end

  # Keys ordered by increasing value, as an OrderedList.
  def sorted
    OrderedList.setup(clean_empty.sort_by{|elem,v| v}.collect{|elem,v| elem})
  end

  # :single TSV mapping each key to its zero-based position in +sorted+.
  def ranks
    ranks = TSV.setup({}, :key_field => self.key_field, :fields => ["Rank"], :cast => :to_i, :type => :single)
    sorted.each_with_index do |elem, i|
      ranks[elem] = i
    end
    ranks
  end

  #{{{ Pvalue stuff

  # Keys with a significant signed p-value: 0 < v <= threshold for a
  # positive +threshold+, threshold <= v < 0 otherwise.
  def significant_pvalues(threshold)
    if threshold > 0
      Misc.prepare_entity(self.select{|k,v| v > 0 and v <= threshold}.collect{|k,v| k}, self.key_field, signature_entity_options)
    else
      Misc.prepare_entity(self.select{|k,v| v < 0 and v >= threshold}.collect{|k,v| k}, self.key_field, signature_entity_options)
    end
  end

  # Runs FDR.adjust_hash! on the signature and returns self.
  def pvalue_fdr_adjust!
    FDR.adjust_hash! self
    self
  end

  # Replaces (in place) every signed p-value with a signed log score:
  # -log(p) for positive values and log(-p) otherwise. A small constant is
  # added before taking the logarithm so that zero does not blow up.
  def pvalue_score
    transform{|value| value > 0 ? -Math.log(value + 0.00000001) : Math.log(-value + 0.00000001)}
  end

  # Keys ordered with Signature.compare_pvalues, as an OrderedList.
  def pvalue_sorted
    OrderedList.setup(pvalue_sorted_pairs.collect{|elem,v| elem})
  end

  # Same ordering as +pvalue_sorted+, with -log(|p|) attached as the weight
  # of each key.
  def pvalue_sorted_weights
    keys = []
    weights = []
    pvalue_sorted_pairs.each{|k,v| keys << k; weights << - Math.log(v.abs)}

    OrderedList.setup(keys, weights)
  end

  private

  # Entity options handed to Misc.prepare_entity, with :organism defaulting
  # to the namespace. Works on a copy so that asking for a list of entities
  # does not modify the options stored in the signature, and tolerates a
  # nil +entity_options+. +clone+ (rather than +dup+) is used so that any
  # module the options hash was extended with is preserved.
  def signature_entity_options
    stored = entity_options
    options = stored.nil? ? {} : stored.clone
    options[:organism] ||= namespace
    options
  end

  # [key, value] pairs of the non-empty entries, values cast with to_f,
  # ordered by Signature.compare_pvalues.
  def pvalue_sorted_pairs
    clean_empty.transform{|v| v.to_f}.sort{|a,b| Signature.compare_pvalues(a[1], b[1]) }
  end
end
@@ -1,4 +1,5 @@
1
1
  require 'rbbt/util/R'
2
+ require 'rbbt/util/colorize'
2
3
 
3
4
  module Heatmap
4
5
  def self.heatmap(values, filename, options = {})
@@ -45,14 +46,71 @@ module Heatmap
45
46
  scale, take_log, add_to_height, colors = Misc.process_options options,
46
47
  :scale, :take_log, :add_to_height, :colors
47
48
 
48
- width = 200 + (values.fields.length * 16)
49
- height = 200 + (values.length * 16)
49
+ width = 1200 + (values.fields.length * 100)
50
+ height = 1000 + (values.length * 50)
50
51
  size = [width, height].max
51
52
  size = [size, 20000].min
53
+ width = [size, width].min
54
+ height = [size, height].min
55
+
56
+ take_log = take_log ? "TRUE" : "FALSE"
57
+ heatmap_script = <<-EOF
58
+ library(ggplot2);
59
+ rbbt.heatmap('#{filename}', #{ width }, #{ height }, data, take_log=#{take_log});
60
+ EOF
61
+
62
+ values.R heatmap_script
63
+
64
+ filename
65
+ end
66
+
67
+ def self.heatmap3(values, filename, options = {})
68
+ scale, take_log, add_to_height, colors = Misc.process_options options,
69
+ :scale, :take_log, :add_to_height, :colors
70
+
71
+ width = 1200 + (values.fields.length * 100)
72
+ height = 1000 + (values.length * 50)
73
+ size = [width, height].max
74
+ size = [size, 2000].min
75
+ width = [size, width].min
76
+ height = [size, height].min
77
+
78
+ take_log = take_log ? "TRUE" : "FALSE"
79
+
80
+ map = options.delete :map
81
+
82
+ if map
83
+ values = values.slice(map.keys)
84
+ clab = TSV.setup(map.keys, :type => :list, :fields => [], :key_field => map.key_field)
85
+
86
+ options[:keys] = []
87
+ options[:colors] = []
88
+ map.fields.each do |field|
89
+ color = Colorize.tsv map.slice(field)
90
+ clab.add_field field do |k, values|
91
+ color[k].to_rgb
92
+ end
93
+ options[:keys] << "" unless options[:keys].empty?
94
+ options[:keys].concat map.values.uniq
95
+ options[:colors] << "#000" unless options[:colors].empty?
96
+ options[:colors].concat color.values_at(*map.keys).collect{|c| c.to_rgb}.uniq
97
+ end
98
+
99
+ if options[:keys].length > 20
100
+ options.delete :keys
101
+ options.delete :colors
102
+ end
103
+
104
+ options[:ColSideColors] = clab
105
+ end
106
+
107
+ other_params = ", " << options.collect{|k,v| [R.ruby2R(k), R.ruby2R(v)] * "="} * ", " if options.any?
52
108
 
53
109
  heatmap_script = <<-EOF
54
- library(ggplot2);
110
+ library(ggplot2, quietly = TRUE, warn.conflicts = TRUE);
111
+ source('#{Rbbt.share.R["heatmap.3.R"].find}');
55
112
 
113
+ rbbt.heatmap.3('#{filename}', #{ width }, #{ height }, data, take_log=#{take_log}#{other_params});
56
114
  EOF
57
115
 
58
116
  values.R heatmap_script
@@ -60,4 +118,5 @@ module Heatmap
60
118
  filename
61
119
  end
62
120
 
121
+
63
122
  end
@@ -1,7 +1,9 @@
1
1
  require 'inline'
2
2
  require 'rsruby'
3
- require 'rbbt/tsv'
3
+ require 'rbbt'
4
+ require 'rbbt/persist'
4
5
  require 'rbbt/persist'
6
+ require 'rbbt/tsv'
5
7
  require 'rbbt/statistics/fdr'
6
8
  require 'rbbt/entity'
7
9
  require 'distribution'
@@ -111,52 +113,43 @@ module TSV
111
113
  fields = [fields] if String === fields or Symbol === fields
112
114
  rename = options.delete :rename
113
115
 
114
- Persist.persist(filename, :yaml, :fields => fields, :persist => persistence, :prefix => "Hyp.Geo.Counts", :other => { :rename => rename }) do
115
- data ||= Hash.new(0)
116
+ persistence_path = self.respond_to?(:persistence_path)? self.persistence_path : nil
117
+ Persist.persist(filename, :yaml, :fields => fields, :persist => persistence, :prefix => "Hyp.Geo.Counts", :other => {:rename => rename, :persistence_path => persistence_path}) do
118
+ data ||= {}
116
119
 
117
120
  with_unnamed do
118
121
 
119
122
  case type
120
123
  when :single
121
124
  through :key, fields do |key, value|
122
- next if value.nil?
123
- data[value] += 1
125
+ next if value.nil?
126
+ data[value] ||= []
127
+ data[value] << key
124
128
  end
125
129
  when :double
126
- if rename
127
- Log.debug("Computing annotation counts with rename: #{rename.values.flatten.compact.uniq.sort * ", "} ")
128
- through :key, fields do |key, values|
129
- next if values.nil?
130
- values.flatten.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq.each{|value| data[value] += 1 }
131
- end
132
- else
133
- through :key, fields do |key, values|
134
- values.flatten.compact.uniq.each{|value| data[value] += 1}
135
- end
130
+ through :key, fields do |key, values|
131
+ values.flatten.compact.uniq.each{|value| data[value] ||= []; data[value] << key}
136
132
  end
137
133
  when :list
138
134
  through :key, fields do |key, values|
139
135
  next if values.nil?
140
- values.compact.uniq.each{|value| data[value] += 1}
136
+ values.compact.uniq.each{|value| data[value] ||= []; data[value] << key}
141
137
  end
142
138
  when :flat
143
- if rename
144
- Log.debug("Computing annotation counts with rename: #{rename.values.flatten.compact.uniq.sort * ", "} ")
145
- through :key, fields do |key, values|
146
- next if values.nil?
147
- values.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq.each{|value| data[value] += 1 }
148
- end
149
- else
150
- through :key, fields do |key, values|
151
- next if values.nil?
152
- values.compact.uniq.each{|value| data[value] += 1}
153
- end
139
+ through :key, fields do |key, values|
140
+ next if values.nil?
141
+ values.compact.uniq.each{|value| data[value] ||= []; data[value] << key}
154
142
  end
155
143
  end
156
144
 
157
145
  end
158
146
 
159
- data
147
+ if rename
148
+ Log.debug("Using renames during annotation counts")
149
+ Hash[*data.keys.zip(data.values.collect{|l| l.collect{|e| rename.include?(e)? rename[e] : e }.uniq.length }).flatten]
150
+ else
151
+ Hash[*data.keys.zip(data.values.collect{|l| l.uniq.length}).flatten]
152
+ end
160
153
  end
161
154
  end
162
155
 
@@ -164,6 +157,8 @@ module TSV
164
157
  options = Misc.add_defaults options, :skip_missing => true, :background => nil
165
158
  background, skip_missing = Misc.process_options options, :background, :skip_missing
166
159
 
160
+ list = list.compact.uniq
161
+
167
162
  if Array === background and not background.empty?
168
163
  filter
169
164
  add_filter(:key, background)
@@ -174,13 +169,11 @@ module TSV
174
169
  end
175
170
  end
176
171
 
177
- list = list.compact.uniq
178
-
179
172
  with_unnamed do
180
173
  fields ||= self.fields.first
181
174
  options = Misc.add_defaults options, :min_support => 3, :fdr => true, :cutoff => false, :add_keys => true
182
175
 
183
- add_keys, rename = Misc.process_options options, :add_keys, :rename
176
+ add_keys, rename, masked = Misc.process_options options, :add_keys, :rename, :masked
184
177
 
185
178
  Log.debug "Enrichment analysis of field #{fields.inspect} for #{list.length} entities"
186
179
 
@@ -194,11 +187,11 @@ module TSV
194
187
  total = found
195
188
  Log.debug "Using #{ found } as sample size; skipping missing"
196
189
  else
197
- total = list.uniq.length
190
+ total = list.length
198
191
  Log.debug "Using #{ list.length } as sample size"
199
192
  end
200
193
 
201
- counts = annotation_counts fields, options[:persist], :rename => rename
194
+ counts = annotation_counts fields, options[:persist], :rename => rename, :masked => masked
202
195
 
203
196
  annotation_keys = Hash.new
204
197
  selected.with_unnamed do
@@ -230,14 +223,14 @@ module TSV
230
223
  end
231
224
 
232
225
  when :flat
233
- selected.through :key, fields do |key, values|
226
+ selected.through do |key, values|
227
+ next if values.nil?
234
228
  values.compact.uniq.reject{|value| value.empty?}.each{|value|
235
229
  value = value.dup
236
230
  annotation_keys[value] ||= []
237
231
  annotation_keys[value] << key
238
232
  }
239
233
  end
240
-
241
234
  end
242
235
 
243
236
  end
@@ -249,6 +242,7 @@ module TSV
249
242
 
250
243
  pvalues = {}
251
244
  annotation_keys.each do |annotation, elems|
245
+ next if masked and masked.include? annotation
252
246
  elems = elems.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq if rename
253
247
  count = elems.length
254
248
  next if count < options[:min_support] or not counts.include? annotation
@@ -53,6 +53,54 @@ module RandomWalk
53
53
  }
54
54
  EOC
55
55
 
56
+ builder.c_singleton <<-'EOC'
57
+ double score_plain_weight(VALUE positions, int total, int missing){
58
+ int idx;
59
+
60
+ int position;
61
+ double penalty;
62
+ double max_top, max_bottom;
63
+ double hit_weights = 0;
64
+
65
+ VALUE rel_l = rb_ary_new();
66
+ VALUE rel_q = rb_ary_new();
67
+
68
+ rb_ary_push(rel_q,rb_float_new(0));
69
+
70
+ // Rescale positions and accumulate weights
71
+
72
+ for (idx = 0; idx < RARRAY_LEN(positions); idx++){
73
+ position = FIX2INT(rb_ary_entry(positions, idx));
74
+
75
+ rb_ary_push(rel_l, rb_float_new((double) position / total));
76
+
77
+ hit_weights += 1;
78
+ rb_ary_push(rel_q, rb_float_new(hit_weights));
79
+ }
80
+
81
+ // Add penalty for missing genes
82
+ penalty = missing * 1;
83
+ hit_weights = hit_weights + penalty;
84
+
85
+ // Traverse list and get extreme values of:
86
+ // Proportion of weight covered - Proportion of hits covered
87
+
88
+ max_top = max_bottom = 0;
89
+ for (idx = 0; idx < RARRAY_LEN(positions); idx++){
90
+ double top = RFLOAT_VALUE(rb_ary_entry(rel_q, idx + 1)) / hit_weights -
91
+ RFLOAT_VALUE(rb_ary_entry(rel_l, idx));
92
+ double bottom = - (penalty + RFLOAT_VALUE(rb_ary_entry(rel_q, idx))) / hit_weights +
93
+ RFLOAT_VALUE(rb_ary_entry(rel_l, idx));
94
+
95
+ if (top > max_top) max_top = top;
96
+ if (bottom > max_bottom) max_bottom = bottom;
97
+ }
98
+
99
+ if (max_top > max_bottom) return max_top;
100
+ else return -max_bottom;
101
+ }
102
+ EOC
103
+
56
104
  builder.c_raw_singleton <<-'EOC'
57
105
  double fitted_weight(int position, int medium){
58
106
  double rel_pos = (double) abs(position - medium) / medium;
@@ -164,10 +212,17 @@ module RandomWalk
164
212
  end
165
213
 
166
214
  class << self
167
- alias score score_fitted_weight
168
- alias score_weights score_custom_weights
215
+ attr_accessor :scoring_method
216
+
217
# Selects the scoring function: records its name in +scoring_method+ and
# makes +score+ an alias of it on this object's singleton class.
#
# method - name (Symbol or String) of an existing singleton method.
#
# The name is stored through the accessor (+self.+ is required; a bare
# assignment only creates a local variable), so that code reading
# +scoring_method+ sees the method that is actually in use.
def set_scoring(method)
  self.scoring_method = method
  singleton_class.send(:alias_method, :score, method.to_sym)
end
169
221
  end
170
222
 
223
+ set_scoring :score_fitted_weight
224
+
225
+
171
226
  def self.combine(up, down)
172
227
  return down if up == 0
173
228
  return up if down == 0
@@ -180,6 +235,7 @@ module RandomWalk
180
235
  end
181
236
  end
182
237
 
238
+ # Two sided
183
239
  def self.score_up_down(up, down, total, missing = 0)
184
240
  scores_up = score(up, total, missing)
185
241
  scores_down = score(down, total, missing)
@@ -187,18 +243,34 @@ module RandomWalk
187
243
  combine(scores_up, scores_down)
188
244
  end
189
245
 
190
- # Two sided
191
- def self.permutations(size, total, missing = 0, times = 10000)
246
+ def self.permutations(size, total, missing = 0, times = 10_000)
192
247
  if size == 0
193
248
  [0] * times
194
249
  else
195
250
  (1..times).collect do
196
- p = Misc.random_sample_in_range(total, size)
197
- score(p.sort, total, missing).abs
251
+ p = []
252
+ sample_without_replacement(total, size, p)
253
+
254
+ score(p, total, missing).abs
198
255
  end
199
256
  end
200
257
  end
201
258
 
259
# Same as +permutations+, but the scores are looked up in, and stored to,
# a repository opened with Persist.open_tokyocabinet at "/tmp/rw_repo5".
# The entry key is a digest of the arguments and the current
# +scoring_method+.
#
# Returns whatever the repository holds for the key after the lookup (and
# the store, when the entry was missing).
def self.persisted_permutations(size, total, missing = 0, times = 10_000)
  repo = Persist.open_tokyocabinet("/tmp/rw_repo5", false, :float_array)
  signature = [size, total, missing, times, scoring_method]
  key = Misc.digest(signature.inspect)

  unless repo[key]
    scores = permutations(size, total, missing, times)
    # Switch the repository to write mode only for the update
    repo.write
    repo[key] = scores
    repo.read
  end

  repo[key]
end
273
+
202
274
  def self.permutations_up_down(size_up, size_down, total, missing = 0, times = 10000)
203
275
  (1..times).collect do
204
276
  score_up_down(Array.new(size_up){ (rand * total).to_i }.sort, Array.new(size_down){ (rand * total).to_i }.sort, total, missing).abs
@@ -207,7 +279,8 @@ module RandomWalk
207
279
 
208
280
  def self.pvalue(permutations, score)
209
281
  score = score.abs
210
- permutations.inject(0){|acc, per|
282
+ permutations.inject(1){|acc, per|
283
+
211
284
  acc += 1 if per > score
212
285
  acc
213
286
  }.to_f / permutations.length
@@ -322,37 +395,44 @@ module OrderedList
322
395
  def pvalue(set, cutoff = 0.1, options = {})
323
396
  set = Set.new(set.compact) unless Set === set
324
397
  options = Misc.add_defaults options, :permutations => 10000, :missing => 0
325
- permutations, missing = Misc.process_options options, :permutations, :missing
398
+ permutations, missing, persist_permutations = Misc.process_options options, :permutations, :missing, :persist_permutations
326
399
 
327
400
  hits = hits(set)
328
-
401
+
329
402
  return 1.0 if hits.empty?
330
403
 
331
- target_score = RandomWalk.score(hits.sort, self.length, 0)
332
- target_score_abs = target_score.abs
333
-
334
- max = (permutations.to_f * cutoff).ceil
404
+ target_score = RandomWalk.score(hits.sort, self.length, missing)
335
405
 
336
- size = set.length
337
- total = self.length
338
- better_permutation_score_count = 1
339
- if size == 0
340
- 1.0
406
+ if persist_permutations
407
+ permutations = RandomWalk.persisted_permutations(set.length, self.length, missing, permutations)
408
+ RandomWalk.pvalue(permutations, target_score)
341
409
  else
342
- (1..permutations).each do
343
- p= []
344
- RandomWalk.sample_without_replacement(total, size, p)
345
-
346
- permutation_score = RandomWalk.score(p.sort, total, missing).abs
347
- if permutation_score.abs > target_score_abs
348
- better_permutation_score_count += 1
410
+ # P-value computation
411
+ target_score_abs = target_score.abs
412
+
413
+ max = (permutations.to_f * cutoff).ceil
414
+
415
+ size = set.length
416
+ total = self.length
417
+ better_permutation_score_count = 1
418
+ if size == 0
419
+ 1.0
420
+ else
421
+ (1..permutations).each do
422
+ p= []
423
+ RandomWalk.sample_without_replacement(total, size, p)
424
+
425
+ permutation_score = RandomWalk.score(p.sort, total, missing).abs
426
+ if permutation_score.abs > target_score_abs
427
+ better_permutation_score_count += 1
428
+ end
429
+
430
+ return 1.0 if better_permutation_score_count > max
349
431
  end
350
-
351
- return 1.0 if better_permutation_score_count > max
432
+ p = (better_permutation_score_count.to_f + 1) / permutations
433
+ p = -p if target_score < 0
434
+ p
352
435
  end
353
- p = better_permutation_score_count.to_f / permutations
354
- p = -p if target_score < 0
355
- p
356
436
  end
357
437
  end
358
438
 
@@ -390,7 +470,7 @@ module OrderedList
390
470
 
391
471
  return 1.0 if better_permutation_score_count > max
392
472
  end
393
- p = better_permutation_score_count.to_f / permutations
473
+ p = (better_permutation_score_count.to_f + 1) / permutations
394
474
  p = -p if target_score < 0
395
475
  p
396
476
  end
@@ -400,27 +480,30 @@ end
400
480
  module TSV
401
481
 
402
482
# Computes the rank-enrichment p-value of +hits+ in the ordered +list+.
#
# list    - Array of ranked elements; it is extended with OrderedList.
# hits    - elements whose enrichment is tested.
# options - Hash passed on to OrderedList#pvalue; :cutoff, when present,
#           is also used as the cutoff argument.
#
# When no :cutoff is given, 0.1 is used, which is the default declared by
# OrderedList#pvalue. Passing nil instead would override that default and
# make the pvalue computation fail when it multiplies the number of
# permutations by the cutoff.
def self.rank_enrichment_for_list(list, hits, options = {})
  cutoff = options[:cutoff] || 0.1
  list.extend OrderedList
  list.pvalue(hits, cutoff, options)
end
411
491
 
412
492
  def self.rank_enrichment(tsv, list, options = {})
493
+ masked = options[:masked]
413
494
  if tsv.fields
414
495
  res = TSV.setup({}, :cast => :to_f, :type => :double, :key_field => tsv.key_field, :fields => ["p-value", tsv.fields.first])
415
496
  else
416
497
  res = TSV.setup({}, :cast => :to_f, :type => :double)
417
498
  end
418
499
 
419
- tsv.with_monitor do
500
+ list = list.clean_annotations if list.respond_to? :clean_annotations
501
+ tsv.with_monitor :desc => "Rank enrichment" do
420
502
  tsv.with_unnamed do
421
503
  tsv.through do |key, values|
422
- pvalue = rank_enrichment_for_list(list, values, options)
423
- res[key] = [pvalue, (values.respond_to?(:subset) ? values.subset(list) : values - list)]
504
+ next if masked and masked.include? key
505
+ pvalue = rank_enrichment_for_list(list, values.flatten, options)
506
+ res[key] = [pvalue, (values.respond_to?(:subset) ? values.subset(list) : values & list)]
424
507
  end
425
508
  end
426
509
  end
@@ -433,4 +516,17 @@ module TSV
433
516
  def rank_enrichment(list, options = {})
434
517
  TSV.rank_enrichment(self, list, options)
435
518
  end
519
+
520
# Builds a :single TSV (field "Rank", cast with :to_i) mapping every key
# returned by sort_by(field, true) to its zero-based position in that
# order. The entity options, entity templates and namespace of this TSV are
# copied onto the result.
def ranks_for(field)
  rank_table = TSV.setup({}, :key_field => self.key_field, :fields => ["Rank"], :type => :single, :cast => :to_i)

  sort_by(field, true).each_with_index{|key, position| rank_table[key] = position }

  rank_table.entity_options = entity_options
  rank_table.entity_templates = entity_templates
  rank_table.namespace = namespace

  rank_table
end
436
532
  end
@@ -5,8 +5,8 @@ module RankProduct
5
5
  scores = {}
6
6
  log_sizes = signature_sizes.collect{|size| Math::log(size)}
7
7
  gene_ranks.each{|gene, positions|
8
- scores[gene] = positions.zip(log_sizes).
9
- collect{|p| Math::log(p[0]) - p[1]}. # Take log and substract from size (normalize)
8
+ scores[gene] = positions.collect{|p| p.nil? or (p.respond_to?(:empty?) and p.empty?) ? signature_sizes.max : p }.zip(log_sizes).
9
+ collect{|p| Math::log(p[0]) - p[1]}.
10
10
  inject(0){|acc, v| acc += v }
11
11
  }
12
12
  scores
@@ -46,7 +46,7 @@ module TSV
46
46
  if block_given?
47
47
  scores = fields.collect{|field| tsv.sort_by(field, true, &block)}
48
48
  else
49
- scores = fields.collect{|field| tsv.sort_by(field, true){|gene,values| tsv.type == :double ? values.first.to_f : value.to_f}}
49
+ scores = fields.collect{|field| tsv.sort_by(field, true){|gene,values| tsv.type == :single ? values.to_f : values.flatten.first.to_f}}
50
50
  end
51
51
  positions = {}
52
52
 
@@ -61,7 +61,10 @@ module TSV
61
61
  end
62
62
  end
63
63
 
64
- score = RankProduct.score(positions, fields.collect{ tsv.size })
64
+ signature_sizes = fields.collect{|field| slice(field).values.select{|v| v and not (v.respond_to?(:empty?) and v.empty?)}.length}
65
+
66
+ #score = RankProduct.score(positions, fields.collect{ tsv.size })
67
+ score = RankProduct.score(positions, signature_sizes)
65
68
 
66
69
  score
67
70
  end
@@ -58,7 +58,6 @@ row7 A B Id3
58
58
  tsv = TSV.open(filename, :sep => /\s+/)
59
59
 
60
60
  assert_equal %w(a), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA", :fdr => false, :background => %w(row1 row2 row3 row4 row5 row6 row7)).collect{|annot, values| pvalue = values.first.first.to_f; pvalue < 0.05 ? annot : nil}.compact
61
- ddd tsv.enrichment(%w(row3 row4 row5), "ValueA", :fdr => false, :background => %w(row1 row2 row3 row4 row5))
62
61
  assert_equal %w(), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA", :fdr => false, :background => %w(row1 row2 row3 row4 row5)).collect{|annot, values| pvalue = values.first.first.to_f; pvalue < 0.05 ? annot : nil}.compact
63
62
  end
64
63
 
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-dm
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
5
- prerelease:
4
+ version: 1.1.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Miguel Vazquez
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-12-21 00:00:00.000000000 Z
11
+ date: 2013-10-21 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rbbt-util
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ! '>='
28
25
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: RubyInline
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
31
  - - ! '>='
36
32
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
38
  - - ! '>='
44
39
  - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: priority_queue
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
45
  - - ! '>='
52
46
  - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
52
  - - ! '>='
60
53
  - !ruby/object:Gem::Version
@@ -62,7 +55,6 @@ dependencies:
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: distribution
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
59
  - - ! '>='
68
60
  - !ruby/object:Gem::Version
@@ -70,7 +62,6 @@ dependencies:
70
62
  type: :runtime
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
66
  - - ! '>='
76
67
  - !ruby/object:Gem::Version
@@ -78,7 +69,6 @@ dependencies:
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: png
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
73
  - - ! '>='
84
74
  - !ruby/object:Gem::Version
@@ -86,7 +76,6 @@ dependencies:
86
76
  type: :runtime
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
80
  - - ! '>='
92
81
  - !ruby/object:Gem::Version
@@ -99,6 +88,9 @@ extra_rdoc_files:
99
88
  - LICENSE
100
89
  files:
101
90
  - LICENSE
91
+ - lib/rbbt/expression/expression.rb
92
+ - lib/rbbt/expression/matrix.rb
93
+ - lib/rbbt/expression/signature.rb
102
94
  - lib/rbbt/network/paths.rb
103
95
  - lib/rbbt/plots/bar.rb
104
96
  - lib/rbbt/plots/heatmap.rb
@@ -108,42 +100,41 @@ files:
108
100
  - lib/rbbt/statistics/rank_product.rb
109
101
  - lib/rbbt/vector/model.rb
110
102
  - lib/rbbt/vector/model/svm.rb
111
- - test/rbbt/statistics/test_fdr.rb
112
103
  - test/rbbt/statistics/test_hypergeometric.rb
113
104
  - test/rbbt/statistics/test_random_walk.rb
105
+ - test/rbbt/statistics/test_fdr.rb
106
+ - test/rbbt/network/test_paths.rb
114
107
  - test/rbbt/vector/test_model.rb
115
108
  - test/rbbt/vector/model/test_svm.rb
116
- - test/rbbt/network/test_paths.rb
117
109
  - test/test_helper.rb
118
110
  homepage: http://github.com/mikisvaz/rbbt-phgx
119
111
  licenses: []
112
+ metadata: {}
120
113
  post_install_message:
121
114
  rdoc_options: []
122
115
  require_paths:
123
116
  - lib
124
117
  required_ruby_version: !ruby/object:Gem::Requirement
125
- none: false
126
118
  requirements:
127
119
  - - ! '>='
128
120
  - !ruby/object:Gem::Version
129
121
  version: '0'
130
122
  required_rubygems_version: !ruby/object:Gem::Requirement
131
- none: false
132
123
  requirements:
133
124
  - - ! '>='
134
125
  - !ruby/object:Gem::Version
135
126
  version: '0'
136
127
  requirements: []
137
128
  rubyforge_project:
138
- rubygems_version: 1.8.24
129
+ rubygems_version: 2.0.3
139
130
  signing_key:
140
- specification_version: 3
131
+ specification_version: 4
141
132
  summary: Data-mining and statistics
142
133
  test_files:
143
- - test/rbbt/statistics/test_fdr.rb
144
134
  - test/rbbt/statistics/test_hypergeometric.rb
145
135
  - test/rbbt/statistics/test_random_walk.rb
136
+ - test/rbbt/statistics/test_fdr.rb
137
+ - test/rbbt/network/test_paths.rb
146
138
  - test/rbbt/vector/test_model.rb
147
139
  - test/rbbt/vector/model/test_svm.rb
148
- - test/rbbt/network/test_paths.rb
149
140
  - test/test_helper.rb