rbbt-GE 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2010-2011 Miguel Vázquez García
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/lib/rbbt/GE/GEO.rb ADDED
@@ -0,0 +1,283 @@
+ require 'rbbt-util'
+ require 'rbbt/GE'
+ require 'rbbt/sources/organism'
+ require 'rbbt/resource'
+ require 'yaml'
+
+ module GEO
+ extend Resource
+ self.pkgdir = "geo"
+ self.subdir = "arrays"
+
+ GEO.claim GEO.root.find(:user), :rake, Rbbt.share.install.GEO.Rakefile.find(:lib)
+
+ def self.comparison_name(field, condition, control)
+ condition = condition * " AND " if Array === condition
+ control = control * " AND " if Array === control
+ [[field, condition] * ": ", [field, control] * ": "] * " => "
+ end
+
+ def self.parse_comparison_name(name)
+ field1, condition1, field2, condition2 = name.match(/(.*): (.*?) => (.*?): (.*)/).values_at(1, 2, 3, 4)
+ condition1 = condition1.split(/ AND /) if condition1 =~ / AND /
+ condition2 = condition2.split(/ AND /) if condition2 =~ / AND /
+
+ [field1, condition1, field2, condition2]
+ end
+
+ def self.platform_info(platform)
+ YAML.load(self[platform]['info.yaml'].produce.read)
+ end
+
+ def self.dataset_info(dataset)
+ YAML.load(self[dataset]['info.yaml'].produce.read)
+ end
+
+ def self.is_control?(value, info)
+ value.to_s.downcase =~ /\bcontrol\b/ or
+ value.to_s.downcase =~ /\bwild/ or
+ value.to_s.downcase =~ /\bnone\b/
+ end
+
+ def self.control_samples(dataset)
+ info = dataset_info(dataset)
+ subsets = info[:subsets]
+
+ control_samples = []
+ subsets.each do |type, values|
+ control_samples.concat values.select{|value,samples| is_control? value, info}.collect{|value,samples| samples.split(",")}.flatten
+ end
+
+ control_samples
+ end
+
+ module SOFT
+
+ GDS_URL="ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/GDS/#DATASET#.soft.gz"
+ GPL_URL="ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_platform/#PLATFORM#/#PLATFORM#_family.soft.gz"
+ GSE_URL="ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_series/#SERIES#/#SERIES#_family.soft.gz"
+
+ GSE_INFO = {
+ :title => "!Series_title",
+ :channel_count => "!Sample_channel_count",
+ :value_type => "!Series_value_type",
+ :platform => "!Series_platform_id",
+ :description => "!Series_summary*", # Join with \n
+ }
+
+ GDS_INFO = {
+ :DELIMITER => "\\^SUBSET",
+ :value_type => "!dataset_value_type",
+ :channel_count => "!dataset_channel_count",
+ :platform => "!dataset_platform",
+ :reference_series => "!dataset_reference_series",
+ :description => "!dataset_description",
+ }
+
+ GDS_SUBSET_INFO = {
+ :DELIMITER => "!subset_.*|!dataset_value_type",
+ :description => "!subset_description",
+ :samples => "!subset_sample_id*",
+ :type => "!subset_type",
+ }
+
+ GPL_INFO = {
+ :DELIMITER => "!platform_table_begin",
+ :organism => "!Platform_organism",
+ :count => "!Platform_data_row_count"
+ }
+
+ # When there are multiple matches, select the most common, unless join is chosen
+ def self.find_field(header, field, join = false)
+ md = header.match(/#{ Regexp.quote field }\s*=\s*(.*)/i)
+ return nil if md.nil? or md.captures.empty?
+
+ case join
+ when false, nil
+ counts = Hash.new(0)
+ md.captures.sort_by{|v| counts[v] += 1}.first
+ when true
+ md.captures * "\n"
+ else
+ md.captures * join
+ end
+ end
+
+ def self.get_info(header, info)
+ result = {}
+
+ info.each do |key, field|
+ next if key == :DELIMITER
+ if field =~ /(.*)\*(.*)(\*)?$/
+ value = find_field(header, $1, $2.empty? ? true : $2)
+ value = value.to_i.to_s == value ? value.to_i : value
+ if $3
+ result[key] = value.split(',')
+ else
+ result[key] = value
+ end
+ else
+ value = find_field(header, field, false)
+ value = value.to_i.to_s == value ? value.to_i : value
+ result[key] = value
+ end
+ end
+
+ if result.empty?
+ nil
+ else
+ result
+ end
+ end
+
+ def self.parse_header(stream, info)
+ header = ""
+ while line = stream.readline
+ header << line
+ break if line =~ /^#{info[:DELIMITER]}/i
+ raise "Delimiter not found" if stream.eof?
+ end
+
+ get_info(header, info)
+ end
+
+ def self.guess_id(organism, codes)
+ num_codes = codes.size
+ best = nil
+ best_count = 0
+ new_fields = []
+ field_counts = {}
+ TmpFile.with_file(codes.to_s) do |codefile|
+
+ codes.all_fields.each_with_index do |field,i|
+ values = CMD.cmd("cat #{ codefile }|cut -f #{ i + 1 }| tr '|' '\\n'|grep [[:alpha:]]|sort -u").read.split("\n").reject{|code| code.empty?}
+
+ new_field, count = Organism.guess_id(organism, values)
+ field_counts[new_field] = count
+ Log.debug "Original field: #{ field }. New: #{new_field}. Count: #{ count }/#{num_codes}"
+ new_fields << (count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.5 ? new_field : "UNKNOWN(#{ field })")
+ if count > best_count
+ best = new_field
+ best_count = count
+ end
+
+ end
+
+ end
+
+ field_counts.delete(new_fields.first)
+ [best, new_fields, field_counts.sort_by{|field, counts| counts}.collect{|field, counts| field}.compact]
+ end
+
+ #{{{ GPL
+
+ def self.GPL(platform, directory)
+ FileUtils.mkdir_p directory unless File.exists? directory
+
+ code_file = File.join(directory, 'codes')
+ info_file = File.join(directory, 'info.yaml')
+
+ stream = Open.open(GPL_URL.gsub('#PLATFORM#', platform), :nocache => true, :pipe => true)
+
+ info = parse_header(stream, GPL_INFO)
+ info[:code_file] = code_file
+ info[:data_directory] = directory
+
+ Log.medium "Producing code file for #{ platform }"
+ codes = TSV.open stream, :fix => proc{|l| l =~ /^!platform_table_end/i ? nil : l}, :header_hash => ""
+ Log.low "Original fields: #{codes.key_field} - #{codes.fields * ", "}"
+ stream.force_close
+
+ best_field, all_new_fields, order = guess_id(Organism.organism(info[:organism]), codes)
+
+ new_key_field, *new_fields = all_new_fields
+
+ new_key_field = codes.key_field if new_key_field =~ /^UNKNOWN/
+
+ codes.key_field = new_key_field.dup
+ codes.fields = new_fields.collect{|f| f.dup}
+
+ Log.low "New fields: #{codes.key_field} - #{codes.fields * ", "}"
+
+ Open.write(code_file, codes.reorder(:key, order).to_s(:sort, true))
+ Open.write(info_file, info.to_yaml)
+
+ info
+ end
+
+ def self.dataset_subsets(stream)
+ text = ""
+ while not (line = stream.gets) =~ /!dataset_table_begin/
+ text << line
+ end
+
+ subsets = text.split(/\^SUBSET/).collect do |chunk|
+ get_info(chunk, GDS_SUBSET_INFO)
+ end
+
+ info = {}
+ subsets.each do |subset|
+ type = subset[:type]
+ description = subset[:description]
+ samples = subset[:samples]
+ info[type] ||= {}
+ info[type][description] = samples
+ end
+
+ info
+ end
+
+ def self.GDS(dataset, directory)
+ FileUtils.mkdir_p directory unless File.exists? directory
+
+ value_file = File.join(directory, 'values')
+ info_file = File.join(directory, 'info.yaml')
+
+ stream = Open.open(GDS_URL.gsub('#DATASET#', dataset), :nocache => true)
+
+ info = parse_header(stream, GDS_INFO)
+ info[:value_file] = value_file
+ info[:data_directory] = directory
+
+ info[:subsets] = dataset_subsets(stream)
+
+ Log.medium "Producing values file for #{ dataset }"
+ values = TSV.open stream, :fix => proc{|l| l =~ /^!dataset_table_end/i ? nil : l.gsub(/null/,'NA')}, :header_hash => ""
+ key_field = TSV.parse_header(GEO[info[:platform]]['codes'].open).key_field
+ values.key_field = key_field
+
+ samples = values.fields.select{|f| f =~ /GSM/}
+
+ Open.write(value_file, values.slice(samples).to_s(:sort, true))
+ Open.write(info_file, info.to_yaml)
+
+ info
+ end
+ end
+
+ def self.compare(dataset, field, condition, control, path)
+ dataset_info = GEO[dataset]["info.yaml"].yaml
+
+ platform = dataset_info[:platform]
+ platform_info = GEO[platform]["info.yaml"].yaml
+
+ log2 = ["count"].include? dataset_info[:value_type]
+ samples = dataset_info[:subsets]
+ value_file = GEO[dataset].values.find.produce
+ format = TSV.parse_header(GEO[platform].codes.open).key_field
+
+ if Array === condition
+ condition_samples = condition.collect{|cond| samples[field][cond].split ","}.flatten
+ else
+ condition_samples = samples[field][condition].split ","
+ end
+
+ if Array === control
+ control_samples = control.collect{|cond| samples[field][cond].split ","}.flatten
+ else
+ control_samples = samples[field][control].split ","
+ end
+
+ GE.analyze(value_file, condition_samples, control_samples, log2, path, format)
+ end
+ end
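
For orientation, a minimal usage sketch of the comparison helpers defined in GEO.rb above. It assumes a working rbbt installation with this gem loaded; the GDS1479 example values are taken from the test suite further down, and the commented output path is hypothetical.

    require 'rbbt/GE/GEO'

    # Build and parse a comparison label (pure string manipulation, no downloads)
    name = GEO.comparison_name "specimen", "carcinoma in situ lesion", "normal mucosa"
    # => "specimen: carcinoma in situ lesion => specimen: normal mucosa"
    field, condition, control_field, control = GEO.parse_comparison_name name

    # Running the comparison itself needs network access to NCBI GEO plus R with limma;
    # GEO.compare writes a TSV of ratios, t statistics and signed p-values to the given path.
    # GEO.compare "GDS1479", field, condition, control, "/tmp/GDS1479.comparison"
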
data/lib/rbbt/GE.rb ADDED
@@ -0,0 +1,36 @@
+ require 'rbbt/util/R'
+
+ module GE
+ LIB_DIR = File.join(File.expand_path(File.dirname(__FILE__)),'../../share/lib/R')
+ MA = File.join(LIB_DIR, 'MA.R')
+
+ def self.run_R(command)
+ cmd = "source('#{MA}');" << command
+ R.run(cmd)
+ end
+
+ def self.r_format(list)
+ case
+ when list.nil?
+ "NULL"
+ when Array === list
+ "c(#{list.collect{|e| r_format e} * ", "})"
+ when (String === list and list === list.to_i.to_s)
+ list.to_i
+ when (String === list and list === list.to_f.to_s)
+ list.to_f
+ when TrueClass === list
+ "TRUE"
+ when FalseClass === list
+ "FALSE"
+ else
+ "'#{list.to_s}'"
+ end
+ end
+
+ def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil)
+ FileUtils.mkdir_p File.dirname(outfile) unless outfile.nil? or File.exists? File.dirname(outfile)
+ GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main)}, contrast = #{r_format(contrast)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field})")
+ end
+ end
+
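A rough illustration of how GE builds its R calls: GE.r_format converts Ruby values to R literals, and GE.analyze interpolates them into a single rbbt.GE.process call (defined in share/lib/R/MA.R below). This is a sketch assuming rbbt-util's R bridge is available; the return values follow directly from the case statement above.

    require 'rbbt/GE'

    GE.r_format nil                        # => "NULL"
    GE.r_format true                       # => "TRUE"
    GE.r_format "12"                       # => 12 (integer-looking strings become numbers)
    GE.r_format ["GSM16978", "GSM16979"]   # => "c('GSM16978', 'GSM16979')"
    GE.r_format "Ensembl Gene ID"          # => "'Ensembl Gene ID'"

    # GE.analyze(datafile, main, contrast, log2, outfile, key_field) then runs, roughly:
    #   source('.../share/lib/R/MA.R'); rbbt.GE.process('<datafile>', main = c(...),
    #     contrast = c(...), log2=FALSE, outfile = '<outfile>', key.field = '<key_field>')
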
data/share/install/GEO/Rakefile ADDED
@@ -0,0 +1,24 @@
+
+ rule /^(GPL\d+)\/?(codes|info\.yaml)?$/ do |t|
+ t.name =~ /^(GPL\d+)\/?(codes|info\.yaml)?/
+ platform = $1
+ file = $2
+ GEO::SOFT.GPL(platform, file.nil? ? t.name : File.dirname(t.name))
+ end
+
+ rule /^(GDS\d+)\/?(values|info\.yaml)?$/ do |t|
+ t.name =~ /^(GDS\d+)\/?(values|info\.yaml)?/
+ dataset = $1
+ file = $2
+ GEO::SOFT.GDS(dataset, file.nil? ? t.name : File.dirname(t.name))
+ end
+
+ rule /^(GDS\d+)\/comparison\/(.*)$/ do |t|
+ t.name =~ /^(GDS\d+)\/comparison\/(.*)/
+ dataset = $1
+ name = $2
+
+ condition_field, condition_name, control_field, control_name = GEO.parse_comparison_name name
+
+ GEO.compare(dataset, condition_field, condition_name, control_name, t.name)
+ end
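
This Rakefile is registered through GEO.claim in GEO.rb, so files requested under the GEO resource are produced on demand by the rules above. A hedged sketch of the mapping, reusing identifiers that appear in the tests:

    require 'rbbt/GE/GEO'

    GEO["GPL999/codes"].produce     # first rule: GEO::SOFT.GPL writes 'codes' and 'info.yaml'
    GEO["GDS750/values"].produce    # second rule: GEO::SOFT.GDS writes 'values' and 'info.yaml'

    # third rule: a comparison path built with GEO.comparison_name triggers GEO.compare
    name = GEO.comparison_name "specimen", "carcinoma in situ lesion", "normal mucosa"
    GEO["GDS1479"].comparison[name].produce
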
data/share/lib/R/MA.R ADDED
@@ -0,0 +1,515 @@
+ library(limma)
+
+ #########################################################################
+ # Model processing
+
+ # Ratio
+ rbbt.GE.process.ratio.oneside <- function(expr){
+ ratio = apply(expr, 1 ,function(x){mean(x, na.rm = TRUE)})
+ names(ratio) <- rownames(expr);
+ return(ratio);
+ }
+
+ rbbt.GE.process.ratio.twoside <- function(expr, contrast){
+ ratio = rbbt.GE.process.ratio.oneside(expr) - rbbt.GE.process.ratio.oneside(contrast)
+ names(ratio) <- rownames(expr);
+ return(ratio);
+ }
+
+ # Limma
+ rbbt.GE.process.limma.oneside <- function(expr, subset = NULL){
+
+ if (is.null(subset)){
+ fit <- lmFit(expr);
+ }else{
+ design = rep(0, dim(expr)[2]);
+ design[names(expr) %in% subset] = 1;
+ }
+
+ fit <- lmFit(expr, design);
+
+ fit <- eBayes(fit);
+
+ sign = fit$t < 0;
+ sign[is.na(sign)] = FALSE;
+ fit$p.value[sign] = - fit$p.value[sign];
+
+ return(list(t= fit$t, p.values= fit$p.value));
+ }
+
+ rbbt.GE.process.limma.twoside <- function(expr, subset.main, subset.contrast){
+
+ design = cbind(rep(1,dim(expr)[2]), rep(0,dim(expr)[2]));
+ colnames(design) <-c('intercept', 'expr');
+ design[names(expr) %in% subset.main,] = 1;
+ design[names(expr) %in% subset.contrast,'intercept'] = 1;
+
+ fit <- lmFit(expr, design);
+
+ fit <- eBayes(fit);
+ sign = fit$t[,2] < 0;
+ sign[is.na(sign)] = FALSE;
+ fit$p.value[sign,2] = - fit$p.value[sign,2];
+
+ return(list(t= fit$t[,2], p.values= fit$p.value[,2]));
+ }
+
+
+
+ rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL){
+ data = rbbt.tsv(file);
+ ids = rownames(data);
+
+ if (log2){
+ data = log2(data);
+ }
+
+ if (is.null(contrast)){
+ ratio = rbbt.GE.process.ratio.oneside(subset(data, select=main));
+ }else{
+ ratio = rbbt.GE.process.ratio.twoside(subset(data, select=main), subset(data, select=contrast) );
+ }
+
+ if (is.null(contrast)){
+ limma = NULL;
+ tryCatch({
+ limma = rbbt.GE.process.limma.oneside(data, main);
+ }, error=function(x){
+ cat("Limma failed for complete dataset. Trying just subset.\n", file=stderr());
+ print(x, file=stderr());
+ tryCatch({
+ limma = rbbt.GE.process.limma.oneside(subset(data, select=main));
+ }, error=function(x){
+ cat("Limma failed for subset dataset.\n", file=stderr());
+ print(x, file=stderr());
+ });
+ })
+ }else{
+ limma = NULL;
+ tryCatch({
+ limma = rbbt.GE.process.limma.twoside(data, main, contrast);
+ }, error=function(x){
+ cat("Limma failed for complete dataset. Trying just subset.\n", file=stderr());
+ print(x, file=stderr());
+ tryCatch({
+ limma = rbbt.GE.process.limma.twoside(subset(data, select=c(main, contrast)), main, contrast);
+ }, error=function(x){
+ cat("Limma failed for subset dataset.\n", file=stderr());
+ print(x, file=stderr());
+ });
+ })
+
+ }
+
+ if (! is.null(limma)){
+ result = data.frame(ratio = ratio[ids], t.values = limma$t[ids], p.values = limma$p.values[ids])
+ }else{
+ result = data.frame(ratio = ratio)
+ }
+
+ if (is.null(outfile)){
+ return(result);
+ }else{
+ rbbt.tsv.write(outfile, result, key.field, ":type=:list#:cast=:to_f");
+ return(NULL);
+ }
+ }
+
+
+
+ ############################################################################
+ ############################################################################
+ ############################################################################
+ ############################################################################
+ ############################################################################
+ # OLD STUFF
+
+
+ MA.get_order <- function(values){
+ orders = values;
+ orders[,] = NA;
+
+ for (i in 1:dim(values)[2]){
+ positions = names(sort(values[,i],decreasing=T,na.last=NA));
+ orders[,i] = NA;
+ orders[positions,i] = 1:length(positions)
+ }
+ orders
+ }
+
+ MA.guess.do.log2 <- function(m, two.channel){
+ if (two.channel){
+ return (sum(m < 0, na.rm = TRUE) == 0);
+ }else{
+ return (max(m, na.rm = TRUE) > 100);
+ }
+ }
+
+ MA.translate <- function(m, trans){
+ trans[trans==""] = NA;
+ trans[trans=="NO MATCH"] = NA;
+
+ missing = length(trans) - dim(m)[1];
+
+ # If extra genes
+ if (missing < 0){
+ trans = c(trans,rep(NA, - missing));
+ missing = 0;
+ }
+ n = apply(m,2,function(x){
+ # Complete data with missing genes
+ x.complete = c(x,rep(NA, missing));
+ tapply(x.complete, factor(trans), median)
+ });
+ n[sort(rownames(n),index.return=T)$ix,]
+ }
+
+ # Conditions
+
+ MA.conditions.has_control <- function(x){
+ keywords = c('none', 'control', 'normal', 'wild', 'baseline', 'untreat', 'uninfected', 'universal', 'reference', 'vehicle', 'w.t.','wt');
+ for(keyword in keywords){
+ control = grep(keyword, x, ignore.case = TRUE);
+ if (any(control)){
+ return(x[control[1]]);
+ }
+ }
+ return(NULL)
+ }
+
+ MA.condition.values <- function(values){
+ control = MA.conditions.has_control(values);
+
+ values.factor = factor(values);
+ values.levels = levels(values.factor);
+
+ # If there is a control state, remove it from sorting
+ if (!is.null(control))
+ values.levels = values.levels[values.levels != control];
+
+
+ # Use numeric sort if they all have numbers
+ if (length(grep('^ *[0-9]+',values.levels,perl=TRUE)) == length(values.levels)){
+ ix = sort(as.numeric(sub('^ *([0-9]+).*',"\\1",values.levels)), decreasing = T, index.return = TRUE)$ix
+ }else{
+ ix = sort(values.levels, decreasing = T, index.return = TRUE)$ix
+ }
+
+ return(list(values = values.levels[ix], control = control));
+ }
+
+
+ #########################################################################
+ # Model processing
+
+ # Ratio
+ MA.ratio.two_channel <- function(m, conditions, main){
+ main = m[,conditions==main];
+ if (!is.null(dim(main))){
+ main = apply(main, 1 ,function(x){mean(x, na.rm = TRUE)});
+ }
+ return(main);
+ }
+
+ MA.ratio.contrast <- function(m, conditions, main, contrast){
+ main = m[,conditions==main];
+ if (!is.null(dim(main))){
+ main = apply(main, 1 ,function(x){mean(x, na.rm = TRUE)});
+ }
+
+ contrast = m[,conditions==contrast];
+ if (!is.null(dim(contrast))){
+ contrast = apply(contrast, 1 ,function(x){mean(x, na.rm = TRUE)});
+ }
+
+ return (main - contrast);
+ }
+
+
+ # Limma
+
+ MA.limma.two_channel <- function(m, conditions, main){
+ if (sum(conditions == main) < 3){
+ return(NULL);
+ }
+
+ design = rep(0,dim(m)[2]);
+ design[conditions == main] = 1;
+
+ # We need to subset the columns because of a problem with NA values. This
+ # might affect eBayes variance estimations, that's my guess anyway...
+
+ fit <- lmFit(m[,design == 1],rep(1, sum(design)));
+
+ tryCatch({
+ fit <- eBayes(fit);
+ sign = fit$t < 0;
+ sign[is.na(sign)] = FALSE;
+ fit$p.value[sign] = - fit$p.value[sign];
+ return(list(t= fit$t, p.values= fit$p.value));
+ }, error=function(x){
+ print("Exception caught in eBayes");
+ print(x);
+ })
+
+ return(NULL);
+ }
+
+ MA.limma.contrast <- function(m, conditions, main, contrast){
+ if (sum(conditions == main) + sum(conditions == contrast) < 3){
+ return(NULL);
+ }
+ m = cbind(m[,conditions == main],m[,conditions == contrast]);
+
+ design = cbind(rep(1,dim(m)[2]), rep(0,dim(m)[2]));
+ colnames(design) <-c('intercept', 'main');
+ design[1:sum(conditions==main),2] = 1;
+
+
+ fit <- lmFit(m,design);
+ tryCatch({
+ fit <- eBayes(fit);
+ sign = fit$t[,2] < 0;
+ sign[is.na(sign)] = FALSE;
+ fit$p.value[sign,2] = - fit$p.value[sign,2]
+ return(list(t= fit$t[,2], p.values= fit$p.value[,2] ));
+ }, error=function(x){
+ print("Exception caught in eBayes");
+ print(x);
+ })
+
+ return(NULL);
+ }
+
+
+ #########################################################################
+ # Process conditions
+
+ MA.strip_blanks <- function(text){
+ text = sub(' *$', '' ,text);
+ text = sub('^ *', '' ,text);
+
+ return(text);
+ }
+
+ MA.orders <- function(ratios, t){
+ best = vector();
+ names = vector();
+ for (name in colnames(ratios)){
+ if (sum(colnames(t) == name) > 0){
+ best = cbind(best, t[,name]);
+ names = c(names, name);
+ }else{
+ best = cbind(best, ratios[,name]);
+ names = c(names, paste(name,'[ratio]', sep=" "));
+ }
+ }
+ rownames(best) <- rownames(ratios);
+ orders <- as.data.frame(MA.get_order(best));
+ colnames(orders) <- names;
+
+ return(orders);
+ }
+
+ MA.process_conditions.contrasts <- function(m, conditions, two.channel){
+ max_levels = 10;
+ max_levels_control = 1;
+
+
+ values = MA.condition.values(conditions);
+
+
+ ratios = vector();
+ t = vector();
+ p.values = vector();
+
+ ratio_names = vector();
+ t_names = vector();
+
+ if (!is.null(values$control)){
+ contrast = values$control;
+ for (main in values$values){
+ name = paste(main, contrast, sep = " <=> ")
+
+ ratio = MA.ratio.contrast(m, conditions, main, contrast);
+ ratio_names = c(ratio_names, name);
+ ratios = cbind(ratios, ratio);
+
+ res = MA.limma.contrast(m, conditions, main, contrast);
+ if (!is.null(res)){
+ t_names = c(t_names, name);
+ t = cbind(t, res$t);
+ p.values = cbind(p.values, res$p.values);
+ }
+ }
+ }
+
+
+ if (length(values$values) <= max_levels_control || (is.null(values$control) && !two.channel && length(values$values) <= max_levels )){
+
+ remaining = values$values;
+ for (main in values$values){
+ remaining = remaining[remaining != main];
+ for (contrast in remaining){
+ name = paste(main, contrast, sep = " <=> ");
+
+ ratio = MA.ratio.contrast(m, conditions, main, contrast);
+ ratio_names = c(ratio_names, name);
+ ratios = cbind(ratios, ratio);
+
+ res = MA.limma.contrast(m, conditions, main, contrast);
+ if (!is.null(res)){
+ t_names = c(t_names, name);
+ t = cbind(t, res$t);
+ p.values = cbind(p.values, res$p.values);
+ }
+ }
+ }
+ }
+
+
+ if (length(ratio_names) != 0){
+ ratio_names = as.vector(sapply(ratio_names, MA.strip_blanks));
+ colnames(ratios) <- ratio_names
+ }
+
+ if (length(t_names) != 0){
+ t_names = as.vector(sapply(t_names, MA.strip_blanks));
+ colnames(t) <- t_names;
+ colnames(p.values) <- t_names;
+ }
+
+
+ return(list(ratios = ratios, t=t, p.values = p.values));
+ }
+
+ MA.process_conditions.two_channel <- function(m, conditions){
+ values = MA.condition.values(conditions);
+
+ all_values = values$values;
+ if (!is.null(values$control)){
+ all_values = c(all_values, values$control);
+ }
+
+
+ ratios = vector();
+ t = vector();
+ p.values = vector();
+
+ ratio_names = vector();
+ t_names = vector();
+
+
+ for (main in all_values){
+ name = main;
+
+ ratio = MA.ratio.two_channel(m, conditions, main);
+ ratio_names = c(ratio_names, name);
+ ratios = cbind(ratios, ratio);
+
+ res = MA.limma.two_channel(m, conditions, main);
+ if (!is.null(res)){
+ t_names = c(t_names, name);
+ t = cbind(t, res$t);
+ p.values = cbind(p.values, res$p.values);
+ }
+ }
+
+ if (length(ratio_names) != 0){
+ ratio_names = as.vector(sapply(ratio_names, MA.strip_blanks));
+ colnames(ratios) <- ratio_names
+ }
+
+ if (length(t_names) != 0){
+ t_names = as.vector(sapply(t_names, MA.strip_blanks));
+ colnames(t) <- t_names;
+ colnames(p.values) <- t_names;
+ }
+
+ return(list(ratios = ratios, t=t, p.values = p.values));
+ }
+
+
+
+ # Process microarray matrix
+
+ MA.process <- function(m, conditions_list, two.channel = FALSE){
+
+ ratios = vector();
+ t = vector();
+ p.values = vector();
+
+ for(type in colnames(conditions_list)){
+ conditions = conditions_list[,type]
+
+ if (two.channel){
+ res = MA.process_conditions.two_channel(m, conditions);
+ if (length(res$ratios) != 0){ colnames(res$ratios) <- sapply(colnames(res$ratios),function(x){paste(type,x,sep=": ")}); ratios = cbind(ratios,res$ratios);}
+ if (length(res$t) != 0){ colnames(res$t) <- sapply(colnames(res$t),function(x){paste(type,x,sep=": ")}); t = cbind(t,res$t);}
+ if (length(res$p.values) != 0){ colnames(res$p.values) <- sapply(colnames(res$p.values),function(x){paste(type,x,sep=": ")}); p.values = cbind(p.values,res$p.values);}
+ }
+
+ res = MA.process_conditions.contrasts(m, conditions, two.channel);
+ if (length(res$ratios) != 0){ colnames(res$ratios) <- sapply(colnames(res$ratios),function(x){paste(type,x,sep=": ")}); ratios = cbind(ratios,res$ratios);}
+ if (length(res$t) != 0){ colnames(res$t) <- sapply(colnames(res$t),function(x){paste(type,x,sep=": ")}); t = cbind(t,res$t);}
+ if (length(res$p.values) != 0){ colnames(res$p.values) <- sapply(colnames(res$p.values),function(x){paste(type,x,sep=": ")}); p.values = cbind(p.values,res$p.values);}
+ }
+
+ orders <- MA.orders(ratios,t);
+ return(list(ratios = ratios, t=t, p.values = p.values, orders=orders));
+ }
+
+
+ MA.save <- function(prefix, orders, ratios, t , p.values, experiments, description = NULL) {
+ if (is.null(orders)){
+ cat("No suitable samples for analysis\n")
+ write(file=paste(prefix,'skip',sep="."), "No suitable samples for analysis" );
+ } else {
+ write.table(file=paste(prefix,'orders',sep="."), orders, sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'codes',sep="."), rownames(orders), sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'logratios',sep="."), ratios, sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'t',sep="."), t, sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'pvalues',sep="."), p.values, sep="\t", row.names=F, col.names=F, quote=F);
+ write.table(file=paste(prefix,'experiments',sep="."), experiments, sep="\t", row.names=F, col.names=F, quote=F);
+
+ write(file=paste(prefix,'description',sep="."), description)
+ }
+ }
+
+ MA.load <- function(prefix, orders = TRUE, logratios = TRUE, t = TRUE, p.values = TRUE){
+ data = list();
+ genes <- scan(file=paste(prefix,'codes',sep="."),sep="\n",quiet=T,what=character());
+ experiments <- scan(file=paste(prefix,'experiments',sep="."),sep="\n",quiet=T,what=character());
+
+ experiments.no.ratio = experiments[- grep('ratio', experiments)];
+
+ if (orders){
+ orders <- read.table(file=paste(prefix,'orders',sep="."),sep="\t");
+ rownames(orders) <- genes;
+ colnames(orders) <- experiments;
+ data$orders=orders;
+ }
+ if (logratios){
+ logratios <- read.table(file=paste(prefix,'logratios',sep="."),sep="\t");
+ rownames(logratios) <- genes;
+ colnames(logratios) <- experiments;
+ data$logratios=logratios;
+ }
+ if (t){
+ t <- read.table(file=paste(prefix,'t',sep="."),sep="\t");
+ rownames(t) <- genes;
+ colnames(t) <- experiments.no.ratio;
+ data$t=t;
+ }
+ if (p.values){
+ p.values <- read.table(file=paste(prefix,'pvalues',sep="."),sep="\t");
+ rownames(p.values) <- genes;
+ colnames(p.values) <- experiments.no.ratio;
+ data$p.values=p.values;
+ }
+
+
+ return(data);
+
+
+ }
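
rbbt.GE.process writes its result as a TSV keyed by key.field, with a ratio column and, when limma succeeds, t.values and p.values columns; the sign of p.values encodes the direction of change (fit$p.value[sign] = - fit$p.value[sign] above). A sketch of reading such a file back from Ruby, assuming rbbt-util is available; the path and gene identifier are hypothetical placeholders:

    require 'rbbt-util'

    results = TSV.open "/tmp/GDS1479.comparison", :type => :list, :cast => :to_f
    results.fields            # => ["ratio", "t.values", "p.values"] when limma succeeded
    results["some_gene_id"]   # => [log-ratio, t statistic, signed p-value]
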
data/test/rbbt/GE/test_GEO.rb ADDED
@@ -0,0 +1,104 @@
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
+ require 'rbbt/GE/GEO'
+
+ class TestClass < Test::Unit::TestCase
+
+ def test_control_sample
+ assert GEO.control_samples('GDS750').include? "GSM16978"
+ end
+
+ def test_GDS
+ assert_equal 'GPL999', GEO.dataset_info('GDS750')[:platform]
+ end
+
+ def test_GPL
+ assert_equal 'Saccharomyces cerevisiae', GEO["GPL999/info.yaml"].yaml[:organism]
+ assert_equal 'Homo sapiens', GEO["GPL570/info.yaml"].yaml[:organism]
+ assert GEO.GPL999.codes.fields.include? "Ensembl Gene ID"
+ end
+
+ def test_normalize
+ dataset = 'GDS750'
+ gene = "YPR191W"
+ id = "6079"
+
+ platform = GEO.GDS(dataset)[:platform]
+ translated = GEO.normalize(platform, ["YPR191W"]).first.first
+
+ assert_equal id, translated
+ end
+
+ def test_analyze_single
+ dataset = 'GDS750'
+ info = GEO.GDS(dataset)
+
+ assert GE.analyze(info[:data_file], info[:subsets]["agent"]["tunicamycin"] ).read =~ /1234/;
+ end
+
+ def test_analyze_contrast
+ dataset = 'GDS750'
+ info = GEO.GDS(dataset)
+ outfile = File.join(File.dirname(info[:data_file]), 'results')
+ key_field = TSV.headers(GEO.GPL(info[:platform])[:code_file]).first
+
+ TmpFile.with_file do |f|
+ GE.analyze(info[:data_file], info[:subsets]["agent"]["tunicamycin"], info[:subsets]["agent"]["DTT"], false, f, key_field);
+ assert File.exists? f
+ FileUtils.rm f
+ end
+ end
+
+ def test_process_subset
+ dataset = 'GDS750'
+ subset = 'agent'
+ id = "6079"
+ info = GEO.GDS(dataset)
+ outfile = File.join(File.dirname(info[:data_file]), 'results')
+ key_field = TSV.headers(GEO.GPL(info[:platform])[:code_file]).first
+
+ TmpFile.with_file do |f|
+ GEO.process_subset(dataset, subset, nil, f)
+ assert File.exists? f
+ FileUtils.rm f
+ end
+
+ t = GEO.process_subset(dataset, subset, 'tunicamycin')
+ assert File.exists? File.join(File.dirname(info[:data_file]), 'analyses/subset.agent.tunicamycin')
+ d = GEO.process_subset(dataset, subset, 'DTT')
+ assert File.exists? File.join(File.dirname(info[:data_file]), 'analyses/subset.agent.DTT')
+
+ assert_in_delta t[id]["p.values"], - d[id]["p.values"], 0.0001
+ end
+
+ def test_GSE
+ gse="GSE966"
+ info = GEO.GSE(gse)
+ assert_equal "GPL764", info[:platform]
+ end
+
+
+ #{{{ NEW TEST
+
+ def test_GSE
+ gse="GSE966"
+ info = GEO.GSE(gse)
+ assert_equal "GPL764", info[:platform]
+ end
+
+ def test_compare
+ dataset = "GDS1479"
+ field = "specimen"
+ condition = "carcinoma in situ lesion"
+ control = "normal mucosa"
+
+ TmpFile.with_file do |path|
+ GEO.compare(dataset, field, condition, control, path)
+ assert File.exists? path
+ end
+
+ assert GEO[dataset].comparison[GEO.comparison_name field, condition, control].produce.exists?
+ end
+
+
+ end
+
data/test/test_helper.rb ADDED
@@ -0,0 +1,4 @@
+ require 'test/unit'
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib')))
+ $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))
+
metadata ADDED
@@ -0,0 +1,87 @@
+ --- !ruby/object:Gem::Specification
+ name: rbbt-GE
+ version: !ruby/object:Gem::Version
+ hash: 27
+ prerelease:
+ segments:
+ - 0
+ - 1
+ - 0
+ version: 0.1.0
+ platform: ruby
+ authors:
+ - Miguel Vazquez
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2012-01-19 00:00:00 +01:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+ name: rbbt-util
+ prerelease: false
+ requirement: &id001 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 3
+ segments:
+ - 0
+ version: "0"
+ type: :runtime
+ version_requirements: *id001
+ description: Gene Expression in RBBT
+ email: miguel.vazquez@cnio.es
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - LICENSE
+ files:
+ - LICENSE
+ - lib/rbbt/GE.rb
+ - lib/rbbt/GE/GEO.rb
+ - share/install/GEO/Rakefile
+ - share/lib/R/MA.R
+ - test/test_helper.rb
+ - test/rbbt/GE/test_GEO.rb
+ has_rdoc: true
+ homepage: http://github.com/mikisvaz/rbbt-GE
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 3
+ segments:
+ - 0
+ version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 3
+ segments:
+ - 0
+ version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.6.2
+ signing_key:
+ specification_version: 3
+ summary: Gene Expression in RBBT
+ test_files:
+ - test/test_helper.rb
+ - test/rbbt/GE/test_GEO.rb