rbbt-dm 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ ZGQzNzI5ZWRiMzBkMTRjNGIxZmUxYTEwMDAyZDEyZjA1NTI0NTI1NQ==
5
+ data.tar.gz: !binary |-
6
+ YTQwYjdlNzU0NGY4ODUzMTBjNDgzMjcxMTAwOWY1OTY2OGM2YjY0Ng==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZDFkMTZiMjMxYWE2NjYxZDFlYzFmNjZmMmQzZDhlNWQ5YzhlYzlhNDEzMmI0
10
+ OGIzYzJkNGFkMjYyMjdjYjVmMjNiNTk4ZTdhYTQzOGJkNjM2MjEwZWJhOTgz
11
+ YWViMDA3NTY2MDE2NTM0YjRhOWMwYjk2Y2RlMGMxMzU4MjQ2YjE=
12
+ data.tar.gz: !binary |-
13
+ MjhmNjc2ODQ5MTZhOWZjNmE0YTQxYzJkN2UzZWQyYjNhZmZkMGJjNjliYTc2
14
+ NjQwOTk0YjZiMjRmNjFkZjQzZGM3OTAwYmYxZWJhOTM1MTI5ZmNmNzVlNTE3
15
+ MTE4YThkZmE2ZjVkMjViNzY5ZWI0NmZiMmYwYTY4Y2Y2NDFjMzU=
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2010-2011 Miguel Vázquez García
1
+ Copyright (c) 2010-2013 Miguel Vázquez García
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
@@ -0,0 +1,99 @@
1
+ require 'rbbt'
2
+ require 'rbbt/tsv'
3
+ require 'rbbt/GE'
4
+
5
# Helpers for loading expression matrices from TSV files, averaging sample
# columns and running differential-expression analyses through GE.
module Expression
  extend Workflow

  # Opens +data_file+ as a :double TSV and, when an identifier translation is
  # requested, re-keys it by +identifier_format+ and collapses every cell to
  # the mean of its values.
  #
  # data_file         - matrix file handed to TSV.open.
  # identifier_file   - file providing alternative identifiers (may be nil).
  # identifier_format - field the matrix should be keyed by (may be nil).
  # organism          - organism code; defaults to the matrix namespace.
  #
  # Returns the untouched :double TSV when no translation is needed (either
  # argument is nil or the matrix is already keyed by +identifier_format+),
  # otherwise a new :list TSV of floats.
  # Raises RuntimeError when the format is not a field of +identifier_file+
  # and no organism is available to look the identifiers up.
  def self.load_matrix(data_file, identifier_file, identifier_format, organism)
    log :open_data, "Opening data file"
    data = TSV.open(data_file, :type => :double, :unnamed => true)

    organism ||= data.namespace

    # Nothing to translate: hand the matrix back as loaded.
    if identifier_file.nil? || identifier_format.nil? || data.key_field == identifier_format
      log :ready, "Matrix ready"
      return data
    end

    available_fields = TSV.parse_header(Open.open(identifier_file)).fields
    if available_fields && available_fields.include?(identifier_format)
      # The identifier file itself carries the requested format.
      log :attach, "Adding #{ identifier_format } from #{ identifier_file }"
      data = data.attach identifier_file, :fields => [identifier_format]
    else
      # Fall back to the organism-wide identifier table.
      raise "No organism defined and identifier_format did not match available formats" if organism.nil?
      require 'rbbt/sources/organism'
      organism_identifiers = Organism.identifiers(organism)
      data.identifiers = identifier_file
      log :attach, "Adding #{ identifier_format } from #{ organism_identifiers }"
      data = data.attach organism_identifiers, :fields => [identifier_format]
    end

    log :reorder, "Reordering data fields"
    other_fields = data.fields.dup.delete_if { |field| field == identifier_format }
    data = data.reorder identifier_format, other_fields

    averaged = TSV.setup({}, :key_field => data.key_field, :fields => data.fields, :type => :list, :cast => :to_f, :namespace => organism, :unnamed => true)
    log :averaging, "Averaging multiple values"
    data.with_unnamed do
      data.through do |key, values|
        averaged[key] = values.collect { |list| Misc.mean(list.collect { |v| v.to_f }) }
      end
    end

    averaged
  end

  # Builds a TSV mapping each key of +matrix_file+ to the mean of the values
  # found in the +samples+ columns. Samples that identify_field cannot resolve
  # are dropped, as are nil values within a row.
  def self.average_samples(matrix_file, samples)
    matrix = TSV.open(matrix_file)
    averages = TSV.setup({}, :key_field => matrix.key_field, :fields => matrix.fields, :cast => matrix.cast, :namespace => matrix.namespace)
    columns = samples.collect { |sample| matrix.identify_field sample }.compact

    matrix.with_unnamed do
      matrix.through do |key, values|
        averages[key] = Misc.mean(values.values_at(*columns).compact)
      end
    end

    averages
  end

  # Runs GE.analyze comparing the +main+ samples against +contrast+ and loads
  # the result as a :list TSV of floats. Sample names that are not fields of
  # the matrix are discarded first.
  #
  # When self is a Step the result is written to its +path+; otherwise a
  # temporary file from TmpFile.with_file is used.
  def self.differential(matrix_file, main, contrast, log2, two_channel)
    header = TSV.parse_header(Open.open(matrix_file))
    key_field, *fields = header.all_fields
    namespace = header.namespace

    main &= fields
    contrast &= fields

    analyze = lambda do |output|
      GE.analyze(matrix_file, main, contrast, log2, output, key_field, two_channel)
      TSV.open(output, :type => :list, :cast => :to_f, :namespace => namespace)
    end

    return analyze.call(path) if Step === self

    TmpFile.with_file { |tmp_path| analyze.call(tmp_path) }
  end

  # Thin wrapper around GE.barcode.
  def self.barcode(matrix_file, output_file, factor = 3)
    GE.barcode(matrix_file, output_file, factor)
  end

  # Selects the rows of +diff_file+ whose "adjusted.p.values" lies strictly
  # between 0 and +cutoff+.
  def self.top_up(diff_file, cutoff = 0.05)
    differences = TSV.open(diff_file, :cast => :to_f)
    differences.select("adjusted.p.values") { |p| p > 0 && p < cutoff }
  end

  # Selects the rows of +diff_file+ whose "adjusted.p.values" lies strictly
  # between -cutoff and 0, then replaces every value in those rows by its
  # absolute value.
  def self.top_down(diff_file, cutoff = 0.05)
    lower_bound = -cutoff
    tsv = TSV.open(diff_file, :cast => :to_f).select("adjusted.p.values") { |p| p < 0 && p > lower_bound }
    tsv.each do |key, values|
      tsv[key] = values.collect { |v| v.abs }
    end
    tsv
  end
end
99
+
@@ -0,0 +1,164 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/misc'
3
+ require 'rbbt/persist'
4
+ require 'rbbt/workflow'
5
+ require 'rbbt/GE/GEO'
6
+ require 'rbbt/statistics/fdr'
7
+ require 'rbbt/expression/expression'
8
+ require 'rbbt/expression/signature'
9
+
10
# An expression matrix backed by a data file, with optional identifier and
# sample-label files. Derived files (translated matrix, averages, sample
# differences, signature sets, ...) are persisted under Matrix::MATRIX_DIR.
class Matrix
  extend Resource
  self.subdir = "var/matrices"
  MATRIX_DIR = Matrix.root.find

  # Builds a Matrix for the GEO dataset +gds+ using its values, samples and
  # platform codes files. log2 is enabled when the dataset's :value_type is
  # "count".
  def self.geo_matrix_for(gds, key_field = nil, organism = nil)
    data = GEO[gds].values.produce.find
    samples = GEO[gds].samples.produce.find

    dataset_info = GEO[gds]['info.yaml'].produce.yaml
    platform = dataset_info[:platform]
    identifiers = GEO[platform].codes.produce.find

    log2 = ["count"].include? dataset_info[:value_type]

    Matrix.new(data, identifiers, samples, key_field, organism, log2)
  end

  attr_accessor :data, :identifiers, :labels, :key_field, :organism, :samples, :log2, :channel

  # data        - matrix file (produced first when it responds to :produce).
  # identifiers - identifier file used when translating the key field.
  # labels      - optional file opened as a TSV of sample labels.
  # key_field   - identifier format the matrix should be keyed by.
  # organism    - organism code.
  # log2        - passed on to Expression.differential.
  # channel     - passed on to Expression.differential as two_channel.
  def initialize(data, identifiers, labels = nil, key_field = nil, organism = nil, log2 = false, channel = false)
    data.produce if data.respond_to? :produce
    @data = data
    # Sample names are the fields of the data file header.
    @samples = TSV::Parser.new(Open.open(data)).fields
    @identifiers = identifiers
    @labels = TSV.open(labels) unless labels.nil?
    @key_field = key_field
    @log2 = log2
    @channel = channel
    @organism = organism
  end

  # Persists the matrix loaded by Expression.load_matrix and returns the path
  # of the persisted file. Matrices keyed by "Ensembl Gene ID" are restricted
  # to Organism.sanctioned_genes.
  def matrix_file(path = nil)
    path ||= Persist.persistence_path(data, {:dir => Matrix::MATRIX_DIR}, {:identifiers => identifiers, :labels => labels, :key_field => key_field, :organism => organism})
    Persist.persist(data, :tsv, :file => path, :check => [data], :no_load => true) do
      matrix = Expression.load_matrix(data, identifiers, key_field, organism)
      matrix = matrix.select(:key => Organism.sanctioned_genes(organism).list) if matrix.key_field == "Ensembl Gene ID"
      matrix
    end
    path
  end

  # Persists the per-key average over +samples+ and returns the file path.
  def average_samples(samples)
    path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'averaged_samples')}, {:samples => samples})
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      Expression.average_samples(matrix_file, samples)
    end
    path
  end

  # Keys of the labels TSV whose +field+ equals +value+ (or contains it, for
  # array-valued entries).
  def find_samples(value, field = nil)
    labels.select(field){|k,v|
      Array === v ? v.flatten.include?(value) : v == value
    }.keys
  end

  # Keeps only the samples that are fields of the data file.
  def remove_missing(samples)
    @samples & samples
  end

  # Averages the samples labelled with +value+; returns the file path.
  def average_label(value, field = nil)
    samples = find_samples(value, field)
    samples = remove_missing(samples)
    average_samples(samples)
  end

  # Persists the output of Expression.barcode and returns the file path.
  #
  # The default path is derived from +factor+. The previous cache key used
  # `main` and `contrast`, which are not defined in this method, so calling
  # barcode without an explicit path raised NameError.
  def barcode(path = nil, factor = 2)
    path ||= Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'barcode')}, {:factor => factor})
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      Expression.barcode(matrix_file, path, factor)
      # Expression.barcode is given +path+ as its output file; nothing to save.
      nil
    end
    path
  end

  # Persists the differential analysis of +main+ versus +contrast+ samples
  # and returns the file path.
  def sample_differences(main, contrast)
    path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'sample_differences')}, {:main => main, :contrast => contrast, :log2 => log2, :channel => channel})
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      Expression.differential(matrix_file, main, contrast, log2, channel)
    end
    path
  end

  # Differential analysis between the samples labelled +main+ and those
  # labelled +contrast+ (or every other labelled sample when +contrast+ is
  # nil). Returns the file path.
  def label_differences(main, contrast = nil, field = nil)
    main_samples, contrast_samples = label_samples(main, contrast, field)
    sample_differences(main_samples, contrast_samples)
  end

  # Persists a TSV with one column per distinct label value, each holding
  # +field+ of the corresponding label_differences result. Labels whose
  # signature fails to compute are logged and skipped. Returns the file path.
  def signature_set(field, cast = nil)
    path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'signature_set')}, {:field => field, :cast => cast})
    Persist.persist(data, :tsv, :file => path, :no_load => true, :check => [matrix_file]) do
      signatures = TSV.open(matrix_file, :fields => [], :type => :list, :cast => cast)
      labels.values.flatten.uniq.sort.each do |value|
        begin
          s = Signature.tsv_field(label_differences(value), field, cast)
          s.fields = [value]
          signatures.attach s
        rescue SignalException, SystemExit, NoMemoryError
          # Never swallow interrupts or shutdown requests.
          raise
        rescue Exception => e
          # Best effort: a failing label must not abort the whole set.
          Log.warn("Signature for #{ value } did not compute: #{ e.class } #{ e.message }")
        end
      end
      signatures
    end
    path
  end

  # Runs R's randomForest to classify +main+ versus +contrast+ samples using
  # the rows listed in options[:features] and persists the importance table.
  # Unlike the other methods, the persisted result is loaded and returned
  # (:no_load => false).
  def random_forest_importance(main, contrast = nil, field = nil, options = {})
    features = Misc.process_options options, :features
    features ||= []

    path = Persist.persistence_path(matrix_file, {:dir => File.join(Matrix::MATRIX_DIR, 'random_forest_importance')}, {:main => main, :contrast => contrast, :field => field, :features => features})
    Persist.persist(data, :tsv, :file => path, :no_load => false, :check => [matrix_file]) do
      main_samples, contrast_samples = label_samples(main, contrast, field)

      TmpFile.with_file do |result|
        R.run <<-EOF
          library(randomForest);
          orig = rbbt.tsv('#{matrix_file}');
          main = c('#{main_samples * "', '"}')
          contrast = c('#{contrast_samples * "', '"}')
          features = c('#{features * "', '"}')

          features = intersect(features, rownames(orig));
          data = t(orig[features, c(main, contrast)])
          data = cbind(data, Class = 0)
          data[main, "Class"] = 1

          rf = randomForest(factor(Class) ~ ., data, na.action = na.exclude)
          rbbt.tsv.write(rf$importance, filename='#{ result }', key.field = '#{@key_field}')
        EOF

        TSV.open(result, :type => :single, :cast => :to_f)
      end
    end
  end

  private

  # Resolves the [main, contrast] sample lists for a label comparison. When
  # +contrast+ is nil every labelled sample outside +main+ is used. Both lists
  # are restricted to samples present in the data file.
  def label_samples(main, contrast, field)
    all_samples = labels.keys
    main_samples = find_samples(main, field)
    contrast_samples = contrast ? find_samples(contrast, field) : all_samples - main_samples

    [remove_missing(main_samples), remove_missing(contrast_samples)]
  end
end
@@ -0,0 +1,151 @@
1
+ require 'rbbt/util/misc'
2
+
3
# Mixin for hash-like objects (typically :single TSVs) that map entities to a
# numeric value, adding transformation, ranking and signed p-value helpers.
#
# Signed p-values: several methods treat the sign of a p-value as a direction
# (positive and negative values are selected and ordered separately).
module Signature

  # Extends +hash+ with Signature and returns it. +options+ is accepted but
  # not used.
  def self.setup(hash, options = {})
    hash.extend Signature
    hash
  end

  # Opens +file+ as a TSV (defaults: :cast => :to_f, :type => :single),
  # restricted to +field+ when given, and extends it with Signature.
  def self.open(file, field = nil, options = {})
    options = Misc.add_defaults options, :fields => nil, :cast => :to_f, :type => :single

    options[:fields] ||= [field] if field

    tsv = TSV.open(file, options)
    tsv.extend Signature
    tsv
  end

  # Extracts column +field+ from +tsv+ (opened first unless already a TSV)
  # as a Signature.
  def self.tsv_field(tsv, field, cast = nil)
    tsv = TSV.open(tsv) unless TSV === tsv
    Signature.setup(tsv.column(field, cast))
  end

  # Comparator for signed p-values, shared by pvalue_sorted and
  # pvalue_sorted_weights. Resulting order: positive values by increasing
  # magnitude first, then negative values by decreasing magnitude. A zero
  # sorts after every positive value and before every negative one.
  #
  # Returns a negative number, zero or a positive number, as Array#sort
  # expects.
  def self.compare_pvalues(a, b)
    return 0 if a == b
    return 1 if a <= 0 && b >= 0
    return -1 if a >= 0 && b <= 0

    a > 0 ? (a.abs <=> b.abs) : (b.abs <=> a.abs)
  end

  #{{{ Basic manipulation

  # Same as the underlying select, but the result is a Signature again.
  def select(*args, &block)
    Signature.setup(super(*args, &block))
  end

  # Replaces every value in place with the result of the block and returns
  # self. The block receives (key, value) when its arity is 2 and (value)
  # when its arity is 1.
  #
  # Raises RuntimeError when no block is given or its arity is neither.
  def transform(&block)
    arity = block_given? ? block.arity : nil
    case arity
    when 2
      self.each do |key, value|
        self[key] = block.call(key, value)
      end
    when 1
      self.each do |key, value|
        self[key] = block.call(value)
      end
    else
      raise "Block not given, or arity not 1 or 2"
    end
    self
  end

  # In-place absolute value of every entry.
  def abs
    transform{|value| value.abs}
  end

  # In-place natural logarithm of every entry.
  def log
    transform{|value| Math.log(value)}
  end

  # Entities whose value is >= +threshold+, prepared with Misc.prepare_entity.
  def values_over(threshold)
    entities_matching{|k,v| v >= threshold}
  end

  # Entities whose value is <= +threshold+, prepared with Misc.prepare_entity.
  def values_under(threshold)
    entities_matching{|k,v| v <= threshold}
  end

  #{{{ Rank stuff

  # Copy of the signature without nil or empty values, flagged as unnamed.
  #
  # The emptiness probe must be :empty? -- the previous :empty never matched
  # any object, so empty strings and arrays were silently kept.
  def clean_empty
    Signature.setup(select{|k,v| v.nil? ? false : (v.respond_to?(:empty?) ? !v.empty? : true)}.tap{|s| s.unnamed = true})
  end

  # Entities ordered by increasing value, as an OrderedList.
  def sorted
    OrderedList.setup(clean_empty.sort_by{|elem,v| v}.collect{|elem,v| elem})
  end

  # TSV mapping each entity to its zero-based position in +sorted+.
  def ranks
    ranks = TSV.setup({}, :key_field => self.key_field, :fields => ["Rank"], :cast => :to_i, :type => :single)
    sorted.each_with_index do |elem, i|
      ranks[elem] = i
    end
    ranks
  end

  #{{{ Pvalue stuff

  # Entities with a significant signed p-value: for a positive +threshold+
  # those in (0, threshold]; otherwise those in [threshold, 0).
  def significant_pvalues(threshold)
    if threshold > 0
      entities_matching{|k,v| v > 0 and v <= threshold}
    else
      entities_matching{|k,v| v < 0 and v >= threshold}
    end
  end

  # Applies FDR.adjust_hash! to the signature and returns self.
  def pvalue_fdr_adjust!
    FDR.adjust_hash! self
    self
  end

  # In-place transformation of signed p-values into scores: -log(p) for
  # positive values and log(-p) otherwise. A small constant avoids log(0).
  def pvalue_score
    transform{|value| value > 0 ? -Math.log(value + 0.00000001) : Math.log(-value + 0.00000001)}
  end

  # Entities ordered by signed p-value (see Signature.compare_pvalues), as an
  # OrderedList.
  def pvalue_sorted
    OrderedList.setup(clean_empty.transform{|v| v.to_f}.sort{|a,b|
      Signature.compare_pvalues(a[1], b[1])
    }.collect{|elem,v| elem})
  end

  # Like pvalue_sorted, but the OrderedList also carries -log(|p|) as the
  # weight of each entity.
  def pvalue_sorted_weights
    sorted = clean_empty.transform{|v| v.to_f}.sort{|a,b|
      Signature.compare_pvalues(a[1], b[1])
    }

    keys = []
    weights = []
    sorted.each{|k,v| keys << k; weights << - Math.log(v.abs)}

    OrderedList.setup(keys, weights)
  end

  private

  # Selects the entries accepted by the block and returns their keys prepared
  # as entities of this signature's key field. The :organism entity option
  # defaults to the namespace.
  def entities_matching(&criteria)
    entity_options = self.entity_options
    entity_options[:organism] ||= self.namespace
    Misc.prepare_entity(self.select(&criteria).collect{|k,v| k}, self.key_field, entity_options)
  end

end
@@ -1,4 +1,5 @@
1
1
  require 'rbbt/util/R'
2
+ require 'rbbt/util/colorize'
2
3
 
3
4
  module Heatmap
4
5
  def self.heatmap(values, filename, options = {})
@@ -45,14 +46,71 @@ module Heatmap
45
46
  scale, take_log, add_to_height, colors = Misc.process_options options,
46
47
  :scale, :take_log, :add_to_height, :colors
47
48
 
48
- width = 200 + (values.fields.length * 16)
49
- height = 200 + (values.length * 16)
49
+ width = 1200 + (values.fields.length * 100)
50
+ height = 1000 + (values.length * 50)
50
51
  size = [width, height].max
51
52
  size = [size, 20000].min
53
+ width = [size, width].min
54
+ height = [size, height].min
55
+
56
+ take_log = take_log ? "TRUE" : "FALSE"
57
+ heatmap_script = <<-EOF
58
+ library(ggplot2);
59
+ rbbt.heatmap('#{filename}', #{ width }, #{ height }, data, take_log=#{take_log});
60
+ EOF
61
+
62
+ values.R heatmap_script
63
+
64
+ filename
65
+ end
66
+
67
+ def self.heatmap3(values, filename, options = {})
68
+ scale, take_log, add_to_height, colors = Misc.process_options options,
69
+ :scale, :take_log, :add_to_height, :colors
70
+
71
+ width = 1200 + (values.fields.length * 100)
72
+ height = 1000 + (values.length * 50)
73
+ size = [width, height].max
74
+ size = [size, 2000].min
75
+ width = [size, width].min
76
+ height = [size, height].min
77
+
78
+ take_log = take_log ? "TRUE" : "FALSE"
79
+
80
+ map = options.delete :map
81
+
82
+ if map
83
+ values = values.slice(map.keys)
84
+ clab = TSV.setup(map.keys, :type => :list, :fields => [], :key_field => map.key_field)
85
+
86
+ options[:keys] = []
87
+ options[:colors] = []
88
+ map.fields.each do |field|
89
+ color = Colorize.tsv map.slice(field)
90
+ clab.add_field field do |k, values|
91
+ color[k].to_rgb
92
+ end
93
+ options[:keys] << "" unless options[:keys].empty?
94
+ options[:keys].concat map.values.uniq
95
+ options[:colors] << "#000" unless options[:colors].empty?
96
+ options[:colors].concat color.values_at(*map.keys).collect{|c| c.to_rgb}.uniq
97
+ end
98
+
99
+ if options[:keys].length > 20
100
+ options.delete :keys
101
+ options.delete :colors
102
+ end
103
+
104
+ options[:ColSideColors] = clab
105
+ end
106
+
107
+ other_params = ", " << options.collect{|k,v| [R.ruby2R(k), R.ruby2R(v)] * "="} * ", " if options.any?
52
108
 
53
109
  heatmap_script = <<-EOF
54
- library(ggplot2);
110
+ library(ggplot2, quietly = TRUE, warn.conflicts = TRUE);
111
+ source('#{Rbbt.share.R["heatmap.3.R"].find}');
55
112
 
113
+ rbbt.heatmap.3('#{filename}', #{ width }, #{ height }, data, take_log=#{take_log}#{other_params});
56
114
  EOF
57
115
 
58
116
  values.R heatmap_script
@@ -60,4 +118,5 @@ module Heatmap
60
118
  filename
61
119
  end
62
120
 
121
+
63
122
  end
@@ -1,7 +1,9 @@
1
1
  require 'inline'
2
2
  require 'rsruby'
3
- require 'rbbt/tsv'
3
+ require 'rbbt'
4
+ require 'rbbt/persist'
4
5
  require 'rbbt/persist'
6
+ require 'rbbt/tsv'
5
7
  require 'rbbt/statistics/fdr'
6
8
  require 'rbbt/entity'
7
9
  require 'distribution'
@@ -111,52 +113,43 @@ module TSV
111
113
  fields = [fields] if String === fields or Symbol === fields
112
114
  rename = options.delete :rename
113
115
 
114
- Persist.persist(filename, :yaml, :fields => fields, :persist => persistence, :prefix => "Hyp.Geo.Counts", :other => { :rename => rename }) do
115
- data ||= Hash.new(0)
116
+ persistence_path = self.respond_to?(:persistence_path)? self.persistence_path : nil
117
+ Persist.persist(filename, :yaml, :fields => fields, :persist => persistence, :prefix => "Hyp.Geo.Counts", :other => {:rename => rename, :persistence_path => persistence_path}) do
118
+ data ||= {}
116
119
 
117
120
  with_unnamed do
118
121
 
119
122
  case type
120
123
  when :single
121
124
  through :key, fields do |key, value|
122
- next if value.nil?
123
- data[value] += 1
125
+ next if value.nil?
126
+ data[value] ||= []
127
+ data[value] << key
124
128
  end
125
129
  when :double
126
- if rename
127
- Log.debug("Computing annotation counts with rename: #{rename.values.flatten.compact.uniq.sort * ", "} ")
128
- through :key, fields do |key, values|
129
- next if values.nil?
130
- values.flatten.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq.each{|value| data[value] += 1 }
131
- end
132
- else
133
- through :key, fields do |key, values|
134
- values.flatten.compact.uniq.each{|value| data[value] += 1}
135
- end
130
+ through :key, fields do |key, values|
131
+ values.flatten.compact.uniq.each{|value| data[value] ||= []; data[value] << key}
136
132
  end
137
133
  when :list
138
134
  through :key, fields do |key, values|
139
135
  next if values.nil?
140
- values.compact.uniq.each{|value| data[value] += 1}
136
+ values.compact.uniq.each{|value| data[value] ||= []; data[value] << key}
141
137
  end
142
138
  when :flat
143
- if rename
144
- Log.debug("Computing annotation counts with rename: #{rename.values.flatten.compact.uniq.sort * ", "} ")
145
- through :key, fields do |key, values|
146
- next if values.nil?
147
- values.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq.each{|value| data[value] += 1 }
148
- end
149
- else
150
- through :key, fields do |key, values|
151
- next if values.nil?
152
- values.compact.uniq.each{|value| data[value] += 1}
153
- end
139
+ through :key, fields do |key, values|
140
+ next if values.nil?
141
+ values.compact.uniq.each{|value| data[value] ||= []; data[value] << key}
154
142
  end
155
143
  end
156
144
 
157
145
  end
158
146
 
159
- data
147
+ if rename
148
+ Log.debug("Using renames during annotation counts")
149
+ Hash[*data.keys.zip(data.values.collect{|l| l.collect{|e| rename.include?(e)? rename[e] : e }.uniq.length }).flatten]
150
+ else
151
+ Hash[*data.keys.zip(data.values.collect{|l| l.uniq.length}).flatten]
152
+ end
160
153
  end
161
154
  end
162
155
 
@@ -164,6 +157,8 @@ module TSV
164
157
  options = Misc.add_defaults options, :skip_missing => true, :background => nil
165
158
  background, skip_missing = Misc.process_options options, :background, :skip_missing
166
159
 
160
+ list = list.compact.uniq
161
+
167
162
  if Array === background and not background.empty?
168
163
  filter
169
164
  add_filter(:key, background)
@@ -174,13 +169,11 @@ module TSV
174
169
  end
175
170
  end
176
171
 
177
- list = list.compact.uniq
178
-
179
172
  with_unnamed do
180
173
  fields ||= self.fields.first
181
174
  options = Misc.add_defaults options, :min_support => 3, :fdr => true, :cutoff => false, :add_keys => true
182
175
 
183
- add_keys, rename = Misc.process_options options, :add_keys, :rename
176
+ add_keys, rename, masked = Misc.process_options options, :add_keys, :rename, :masked
184
177
 
185
178
  Log.debug "Enrichment analysis of field #{fields.inspect} for #{list.length} entities"
186
179
 
@@ -194,11 +187,11 @@ module TSV
194
187
  total = found
195
188
  Log.debug "Using #{ found } as sample size; skipping missing"
196
189
  else
197
- total = list.uniq.length
190
+ total = list.length
198
191
  Log.debug "Using #{ list.length } as sample size"
199
192
  end
200
193
 
201
- counts = annotation_counts fields, options[:persist], :rename => rename
194
+ counts = annotation_counts fields, options[:persist], :rename => rename, :masked => masked
202
195
 
203
196
  annotation_keys = Hash.new
204
197
  selected.with_unnamed do
@@ -230,14 +223,14 @@ module TSV
230
223
  end
231
224
 
232
225
  when :flat
233
- selected.through :key, fields do |key, values|
226
+ selected.through do |key, values|
227
+ next if values.nil?
234
228
  values.compact.uniq.reject{|value| value.empty?}.each{|value|
235
229
  value = value.dup
236
230
  annotation_keys[value] ||= []
237
231
  annotation_keys[value] << key
238
232
  }
239
233
  end
240
-
241
234
  end
242
235
 
243
236
  end
@@ -249,6 +242,7 @@ module TSV
249
242
 
250
243
  pvalues = {}
251
244
  annotation_keys.each do |annotation, elems|
245
+ next if masked and masked.include? annotation
252
246
  elems = elems.collect{|elem| rename.include?(elem)? rename[elem] : elem }.compact.uniq if rename
253
247
  count = elems.length
254
248
  next if count < options[:min_support] or not counts.include? annotation
@@ -53,6 +53,54 @@ module RandomWalk
53
53
  }
54
54
  EOC
55
55
 
56
+ builder.c_singleton <<-'EOC'
57
+ double score_plain_weight(VALUE positions, int total, int missing){
58
+ int idx;
59
+
60
+ int position;
61
+ double penalty;
62
+ double max_top, max_bottom;
63
+ double hit_weights = 0;
64
+
65
+ VALUE rel_l = rb_ary_new();
66
+ VALUE rel_q = rb_ary_new();
67
+
68
+ rb_ary_push(rel_q,rb_float_new(0));
69
+
70
+ // Rescale positions and accumulate weights
71
+
72
+ for (idx = 0; idx < RARRAY_LEN(positions); idx++){
73
+ position = FIX2INT(rb_ary_entry(positions, idx));
74
+
75
+ rb_ary_push(rel_l, rb_float_new((double) position / total));
76
+
77
+ hit_weights += 1;
78
+ rb_ary_push(rel_q, rb_float_new(hit_weights));
79
+ }
80
+
81
+ // Add penalty for missing genes
82
+ penalty = missing * 1;
83
+ hit_weights = hit_weights + penalty;
84
+
85
+ // Traverse list and get extreme values of:
86
+ // Proportion of weight covered - Proportion of hits covered
87
+
88
+ max_top = max_bottom = 0;
89
+ for (idx = 0; idx < RARRAY_LEN(positions); idx++){
90
+ double top = RFLOAT_VALUE(rb_ary_entry(rel_q, idx + 1)) / hit_weights -
91
+ RFLOAT_VALUE(rb_ary_entry(rel_l, idx));
92
+ double bottom = - (penalty + RFLOAT_VALUE(rb_ary_entry(rel_q, idx))) / hit_weights +
93
+ RFLOAT_VALUE(rb_ary_entry(rel_l, idx));
94
+
95
+ if (top > max_top) max_top = top;
96
+ if (bottom > max_bottom) max_bottom = bottom;
97
+ }
98
+
99
+ if (max_top > max_bottom) return max_top;
100
+ else return -max_bottom;
101
+ }
102
+ EOC
103
+
56
104
  builder.c_raw_singleton <<-'EOC'
57
105
  double fitted_weight(int position, int medium){
58
106
  double rel_pos = (double) abs(position - medium) / medium;
@@ -164,10 +212,17 @@ module RandomWalk
164
212
  end
165
213
 
166
214
  class << self
167
- alias score score_fitted_weight
168
- alias score_weights score_custom_weights
215
+ attr_accessor :scoring_method
216
+
217
# Selects the scoring function used by +score+: records its name in
# +scoring_method+ and aliases +score+ to it on the singleton class.
#
# method - name (Symbol or String) of an existing singleton method.
def set_scoring(method)
  # Must go through the accessor: a bare `scoring_method = method` only
  # creates a local variable and leaves the attribute nil.
  self.scoring_method = method
  singleton_class.send(:alias_method, :score, method.to_sym)
end
169
221
  end
170
222
 
223
+ set_scoring :score_fitted_weight
224
+
225
+
171
226
  def self.combine(up, down)
172
227
  return down if up == 0
173
228
  return up if down == 0
@@ -180,6 +235,7 @@ module RandomWalk
180
235
  end
181
236
  end
182
237
 
238
+ # Two sided
183
239
  def self.score_up_down(up, down, total, missing = 0)
184
240
  scores_up = score(up, total, missing)
185
241
  scores_down = score(down, total, missing)
@@ -187,18 +243,34 @@ module RandomWalk
187
243
  combine(scores_up, scores_down)
188
244
  end
189
245
 
190
- # Two sided
191
- def self.permutations(size, total, missing = 0, times = 10000)
246
+ def self.permutations(size, total, missing = 0, times = 10_000)
192
247
  if size == 0
193
248
  [0] * times
194
249
  else
195
250
  (1..times).collect do
196
- p = Misc.random_sample_in_range(total, size)
197
- score(p.sort, total, missing).abs
251
+ p = []
252
+ sample_without_replacement(total, size, p)
253
+
254
+ score(p, total, missing).abs
198
255
  end
199
256
  end
200
257
  end
201
258
 
259
# Returns the permutation scores for the given parameters, computing them
# with +permutations+ only when they are not yet stored in the
# TokyoCabinet repository at /tmp/rw_repo5. Entries are keyed by a digest
# of size, total, missing, times and the current scoring_method.
def self.persisted_permutations(size, total, missing = 0, times = 10_000)
  repo = Persist.open_tokyocabinet("/tmp/rw_repo5", false, :float_array)
  key = Misc.digest([size, total, missing, times, scoring_method].inspect)

  unless repo[key]
    scores = permutations(size, total, missing, times)
    # Switch to write mode for the store, then back to read mode.
    repo.write
    repo[key] = scores
    repo.read
  end

  repo[key]
end
273
+
202
274
  def self.permutations_up_down(size_up, size_down, total, missing = 0, times = 10000)
203
275
  (1..times).collect do
204
276
  score_up_down(Array.new(size_up){ (rand * total).to_i }.sort, Array.new(size_down){ (rand * total).to_i }.sort, total, missing).abs
@@ -207,7 +279,8 @@ module RandomWalk
207
279
 
208
280
  def self.pvalue(permutations, score)
209
281
  score = score.abs
210
- permutations.inject(0){|acc, per|
282
+ permutations.inject(1){|acc, per|
283
+
211
284
  acc += 1 if per > score
212
285
  acc
213
286
  }.to_f / permutations.length
@@ -322,37 +395,44 @@ module OrderedList
322
395
  def pvalue(set, cutoff = 0.1, options = {})
323
396
  set = Set.new(set.compact) unless Set === set
324
397
  options = Misc.add_defaults options, :permutations => 10000, :missing => 0
325
- permutations, missing = Misc.process_options options, :permutations, :missing
398
+ permutations, missing, persist_permutations = Misc.process_options options, :permutations, :missing, :persist_permutations
326
399
 
327
400
  hits = hits(set)
328
-
401
+
329
402
  return 1.0 if hits.empty?
330
403
 
331
- target_score = RandomWalk.score(hits.sort, self.length, 0)
332
- target_score_abs = target_score.abs
333
-
334
- max = (permutations.to_f * cutoff).ceil
404
+ target_score = RandomWalk.score(hits.sort, self.length, missing)
335
405
 
336
- size = set.length
337
- total = self.length
338
- better_permutation_score_count = 1
339
- if size == 0
340
- 1.0
406
+ if persist_permutations
407
+ permutations = RandomWalk.persisted_permutations(set.length, self.length, missing, permutations)
408
+ RandomWalk.pvalue(permutations, target_score)
341
409
  else
342
- (1..permutations).each do
343
- p= []
344
- RandomWalk.sample_without_replacement(total, size, p)
345
-
346
- permutation_score = RandomWalk.score(p.sort, total, missing).abs
347
- if permutation_score.abs > target_score_abs
348
- better_permutation_score_count += 1
410
+ # P-value computation
411
+ target_score_abs = target_score.abs
412
+
413
+ max = (permutations.to_f * cutoff).ceil
414
+
415
+ size = set.length
416
+ total = self.length
417
+ better_permutation_score_count = 1
418
+ if size == 0
419
+ 1.0
420
+ else
421
+ (1..permutations).each do
422
+ p= []
423
+ RandomWalk.sample_without_replacement(total, size, p)
424
+
425
+ permutation_score = RandomWalk.score(p.sort, total, missing).abs
426
+ if permutation_score.abs > target_score_abs
427
+ better_permutation_score_count += 1
428
+ end
429
+
430
+ return 1.0 if better_permutation_score_count > max
349
431
  end
350
-
351
- return 1.0 if better_permutation_score_count > max
432
+ p = (better_permutation_score_count.to_f + 1) / permutations
433
+ p = -p if target_score < 0
434
+ p
352
435
  end
353
- p = better_permutation_score_count.to_f / permutations
354
- p = -p if target_score < 0
355
- p
356
436
  end
357
437
  end
358
438
 
@@ -390,7 +470,7 @@ module OrderedList
390
470
 
391
471
  return 1.0 if better_permutation_score_count > max
392
472
  end
393
- p = better_permutation_score_count.to_f / permutations
473
+ p = (better_permutation_score_count.to_f + 1) / permutations
394
474
  p = -p if target_score < 0
395
475
  p
396
476
  end
@@ -400,27 +480,30 @@ end
400
480
  module TSV
401
481
 
402
482
  def self.rank_enrichment_for_list(list, hits, options = {})
403
- cutoff = Misc.process_options options, :cutoff
483
+ cutoff = options[:cutoff]
404
484
  list.extend OrderedList
405
485
  if cutoff
406
486
  list.pvalue(hits, cutoff, options)
407
487
  else
408
- list.pvalue(hits, options)
488
+ list.pvalue(hits, nil, options)
409
489
  end
410
490
  end
411
491
 
412
492
  def self.rank_enrichment(tsv, list, options = {})
493
+ masked = options[:masked]
413
494
  if tsv.fields
414
495
  res = TSV.setup({}, :cast => :to_f, :type => :double, :key_field => tsv.key_field, :fields => ["p-value", tsv.fields.first])
415
496
  else
416
497
  res = TSV.setup({}, :cast => :to_f, :type => :double)
417
498
  end
418
499
 
419
- tsv.with_monitor do
500
+ list = list.clean_annotations if list.respond_to? :clean_annotations
501
+ tsv.with_monitor :desc => "Rank enrichment" do
420
502
  tsv.with_unnamed do
421
503
  tsv.through do |key, values|
422
- pvalue = rank_enrichment_for_list(list, values, options)
423
- res[key] = [pvalue, (values.respond_to?(:subset) ? values.subset(list) : values - list)]
504
+ next if masked and masked.include? key
505
+ pvalue = rank_enrichment_for_list(list, values.flatten, options)
506
+ res[key] = [pvalue, (values.respond_to?(:subset) ? values.subset(list) : values & list)]
424
507
  end
425
508
  end
426
509
  end
@@ -433,4 +516,17 @@ module TSV
433
516
  def rank_enrichment(list, options = {})
434
517
  TSV.rank_enrichment(self, list, options)
435
518
  end
519
+
520
# Builds a :single TSV ("Rank" field, integer cast) assigning each key its
# zero-based position in sort_by(field, true). The result inherits this
# TSV's entity options, entity templates and namespace.
def ranks_for(field)
  rank_table = TSV.setup({}, :key_field => key_field, :fields => ["Rank"], :type => :single, :cast => :to_i)

  sort_by(field, true).each_with_index { |key, position| rank_table[key] = position }

  rank_table.entity_options = entity_options
  rank_table.entity_templates = entity_templates
  rank_table.namespace = namespace

  rank_table
end
436
532
  end
@@ -5,8 +5,8 @@ module RankProduct
5
5
  scores = {}
6
6
  log_sizes = signature_sizes.collect{|size| Math::log(size)}
7
7
  gene_ranks.each{|gene, positions|
8
- scores[gene] = positions.zip(log_sizes).
9
- collect{|p| Math::log(p[0]) - p[1]}. # Take log and substract from size (normalize)
8
+ scores[gene] = positions.collect{|p| p.nil? or (p.respond_to?(:empty?) and p.empty?) ? signature_sizes.max : p }.zip(log_sizes).
9
+ collect{|p| Math::log(p[0]) - p[1]}.
10
10
  inject(0){|acc, v| acc += v }
11
11
  }
12
12
  scores
@@ -46,7 +46,7 @@ module TSV
46
46
  if block_given?
47
47
  scores = fields.collect{|field| tsv.sort_by(field, true, &block)}
48
48
  else
49
- scores = fields.collect{|field| tsv.sort_by(field, true){|gene,values| tsv.type == :double ? values.first.to_f : value.to_f}}
49
+ scores = fields.collect{|field| tsv.sort_by(field, true){|gene,values| tsv.type == :single ? values.to_f : values.flatten.first.to_f}}
50
50
  end
51
51
  positions = {}
52
52
 
@@ -61,7 +61,10 @@ module TSV
61
61
  end
62
62
  end
63
63
 
64
- score = RankProduct.score(positions, fields.collect{ tsv.size })
64
+ signature_sizes = fields.collect{|field| slice(field).values.select{|v| v and not (v.respond_to?(:empty?) and v.empty?)}.length}
65
+
66
+ #score = RankProduct.score(positions, fields.collect{ tsv.size })
67
+ score = RankProduct.score(positions, signature_sizes)
65
68
 
66
69
  score
67
70
  end
@@ -58,7 +58,6 @@ row7 A B Id3
58
58
  tsv = TSV.open(filename, :sep => /\s+/)
59
59
 
60
60
  assert_equal %w(a), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA", :fdr => false, :background => %w(row1 row2 row3 row4 row5 row6 row7)).collect{|annot, values| pvalue = values.first.first.to_f; pvalue < 0.05 ? annot : nil}.compact
61
- ddd tsv.enrichment(%w(row3 row4 row5), "ValueA", :fdr => false, :background => %w(row1 row2 row3 row4 row5))
62
61
  assert_equal %w(), tsv.enrichment(%w(row1 row3 row4 row5), "ValueA", :fdr => false, :background => %w(row1 row2 row3 row4 row5)).collect{|annot, values| pvalue = values.first.first.to_f; pvalue < 0.05 ? annot : nil}.compact
63
62
  end
64
63
 
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-dm
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
5
- prerelease:
4
+ version: 1.1.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Miguel Vazquez
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-12-21 00:00:00.000000000 Z
11
+ date: 2013-10-21 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: rbbt-util
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ! '>='
28
25
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: RubyInline
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
31
  - - ! '>='
36
32
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
38
  - - ! '>='
44
39
  - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: priority_queue
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
45
  - - ! '>='
52
46
  - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
52
  - - ! '>='
60
53
  - !ruby/object:Gem::Version
@@ -62,7 +55,6 @@ dependencies:
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: distribution
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
59
  - - ! '>='
68
60
  - !ruby/object:Gem::Version
@@ -70,7 +62,6 @@ dependencies:
70
62
  type: :runtime
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
66
  - - ! '>='
76
67
  - !ruby/object:Gem::Version
@@ -78,7 +69,6 @@ dependencies:
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: png
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
73
  - - ! '>='
84
74
  - !ruby/object:Gem::Version
@@ -86,7 +76,6 @@ dependencies:
86
76
  type: :runtime
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
80
  - - ! '>='
92
81
  - !ruby/object:Gem::Version
@@ -99,6 +88,9 @@ extra_rdoc_files:
99
88
  - LICENSE
100
89
  files:
101
90
  - LICENSE
91
+ - lib/rbbt/expression/expression.rb
92
+ - lib/rbbt/expression/matrix.rb
93
+ - lib/rbbt/expression/signature.rb
102
94
  - lib/rbbt/network/paths.rb
103
95
  - lib/rbbt/plots/bar.rb
104
96
  - lib/rbbt/plots/heatmap.rb
@@ -108,42 +100,41 @@ files:
108
100
  - lib/rbbt/statistics/rank_product.rb
109
101
  - lib/rbbt/vector/model.rb
110
102
  - lib/rbbt/vector/model/svm.rb
111
- - test/rbbt/statistics/test_fdr.rb
112
103
  - test/rbbt/statistics/test_hypergeometric.rb
113
104
  - test/rbbt/statistics/test_random_walk.rb
105
+ - test/rbbt/statistics/test_fdr.rb
106
+ - test/rbbt/network/test_paths.rb
114
107
  - test/rbbt/vector/test_model.rb
115
108
  - test/rbbt/vector/model/test_svm.rb
116
- - test/rbbt/network/test_paths.rb
117
109
  - test/test_helper.rb
118
110
  homepage: http://github.com/mikisvaz/rbbt-phgx
119
111
  licenses: []
112
+ metadata: {}
120
113
  post_install_message:
121
114
  rdoc_options: []
122
115
  require_paths:
123
116
  - lib
124
117
  required_ruby_version: !ruby/object:Gem::Requirement
125
- none: false
126
118
  requirements:
127
119
  - - ! '>='
128
120
  - !ruby/object:Gem::Version
129
121
  version: '0'
130
122
  required_rubygems_version: !ruby/object:Gem::Requirement
131
- none: false
132
123
  requirements:
133
124
  - - ! '>='
134
125
  - !ruby/object:Gem::Version
135
126
  version: '0'
136
127
  requirements: []
137
128
  rubyforge_project:
138
- rubygems_version: 1.8.24
129
+ rubygems_version: 2.0.3
139
130
  signing_key:
140
- specification_version: 3
131
+ specification_version: 4
141
132
  summary: Data-mining and statistics
142
133
  test_files:
143
- - test/rbbt/statistics/test_fdr.rb
144
134
  - test/rbbt/statistics/test_hypergeometric.rb
145
135
  - test/rbbt/statistics/test_random_walk.rb
136
+ - test/rbbt/statistics/test_fdr.rb
137
+ - test/rbbt/network/test_paths.rb
146
138
  - test/rbbt/vector/test_model.rb
147
139
  - test/rbbt/vector/model/test_svm.rb
148
- - test/rbbt/network/test_paths.rb
149
140
  - test/test_helper.rb