rbbt-GE 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010-2011 Miguel Vázquez García
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,283 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/GE'
3
+ require 'rbbt/sources/organism'
4
+ require 'rbbt/resource'
5
+ require 'yaml'
6
+
7
+ module GEO
8
extend Resource
# Cached files live under <resource base>/geo/arrays
self.pkgdir = "geo"
self.subdir = "arrays"

# Entries under the user's GEO directory are produced on demand by the
# bundled install Rakefile.
GEO.claim GEO.root.find(:user), :rake, Rbbt.share.install.GEO.Rakefile.find(:lib)
13
+
14
# Builds a human-readable comparison identifier of the form
# "<field>: <condition> => <field>: <control>". Array values are
# joined with " AND ".
def self.comparison_name(field, condition, control)
  condition_text = Array === condition ? condition.join(" AND ") : condition
  control_text   = Array === control   ? control.join(" AND ")   : control
  "#{field}: #{condition_text} => #{field}: #{control_text}"
end
19
+
20
# Inverse of comparison_name: splits a comparison identifier back into
# [condition_field, condition, control_field, control]. Values containing
# " AND " come back as arrays.
def self.parse_comparison_name(name)
  match = name.match(/(.*): (.*?) => (.*?): (.*)/)
  field1, value1, field2, value2 = match.values_at(1, 2, 3, 4)

  value1 = value1.split(/ AND /) if value1 =~ / AND /
  value2 = value2.split(/ AND /) if value2 =~ / AND /

  [field1, value1, field2, value2]
end
27
+
28
# Reads the cached info.yaml of a GEO platform (e.g. "GPL570"),
# producing the resource first if it does not exist yet.
def self.platform_info(platform)
  yaml_text = self[platform]['info.yaml'].produce.read
  YAML.load(yaml_text)
end
31
+
32
# Reads the cached info.yaml of a GEO dataset (e.g. "GDS750"),
# producing the resource first if it does not exist yet.
def self.dataset_info(dataset)
  yaml_text = self[dataset]['info.yaml'].produce.read
  YAML.load(yaml_text)
end
35
+
36
# True when a subset value looks like a control condition
# ("control", "wild type"/"wildtype", "none").
# Fixed: downcase the value once and return an actual boolean instead of
# the MatchData/nil the previous regexp chain produced (the +info+
# parameter is unused but kept for interface compatibility).
def self.is_control?(value, info)
  text = value.to_s.downcase
  !!(text =~ /\bcontrol\b/ or text =~ /\bwild/ or text =~ /\bnone\b/)
end
41
+
42
# Returns the sample ids (GSM accessions) of every subset value that
# looks like a control condition (see is_control?).
def self.control_samples(dataset)
  info = dataset_info(dataset)

  info[:subsets].inject([]) do |found, (type, values)|
    controls = values.select{|value, samples| is_control? value, info}
    found.concat(controls.collect{|value, samples| samples.split(",")}.flatten)
  end
end
53
+
54
+ module SOFT
55
+
56
# URL templates for the SOFT files on the NCBI GEO FTP site.
GDS_URL="ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/GDS/#DATASET#.soft.gz"
GPL_URL="ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_platform/#PLATFORM#/#PLATFORM#_family.soft.gz"
GSE_URL="ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_series/#SERIES#/#SERIES#_family.soft.gz"

# Header fields extracted per record type (see get_info). A trailing '*'
# in a pattern makes get_info join all matches; :DELIMITER marks where the
# header section of the stream ends (see parse_header).
GSE_INFO = {
  :title => "!Series_title",
  :channel_count => "!Sample_channel_count",
  :value_type => "!Series_value_type",
  :platform => "!Series_platform_id",
  :description => "!Series_summary*", # Join with \n
}

GDS_INFO = {
  :DELIMITER => "\\^SUBSET",
  :value_type => "!dataset_value_type",
  :channel_count => "!dataset_channel_count",
  :platform => "!dataset_platform",
  :reference_series => "!dataset_reference_series",
  :description => "!dataset_description",
}

GDS_SUBSET_INFO = {
  :DELIMITER => "!subset_.*|!dataset_value_type",
  :description => "!subset_description",
  :samples => "!subset_sample_id*",
  :type => "!subset_type",
}

GPL_INFO = {
  :DELIMITER => "!platform_table_begin",
  :organism => "!Platform_organism",
  :count => "!Platform_data_row_count"
}
89
+
90
# Extracts the value of a "field = value" header line (case-insensitive).
# Returns nil when the field is absent. +join+ controls how multiple
# captures are combined: false/nil picks the first by occurrence order,
# true joins with newlines, any other value is used as the separator.
def self.find_field(header, field, join = false)
  match = header.match(/#{ Regexp.quote field }\s*=\s*(.*)/i)
  return nil if match.nil? or match.captures.empty?

  case join
  when false, nil
    seen = Hash.new(0)
    match.captures.sort_by{|v| seen[v] += 1}.first
  when true
    match.captures.join("\n")
  else
    match.captures.join(join)
  end
end
105
+
106
# Extracts every field declared in +info+ (a *_INFO hash) from a SOFT
# header string. Integer-looking values are cast to Integer. A field
# pattern ending in '*' requests joined matches (the text after the '*'
# is used as separator, newline when empty). Returns nil when nothing
# could be extracted.
def self.get_info(header, info)
  result = {}

  info.each do |key, field|
    next if key == :DELIMITER

    if m = field.match(/(.*)\*(.*)(\*)?$/)
      separator = m[2].empty? ? true : m[2]
      value = find_field(header, m[1], separator)
      value = value.to_i.to_s == value ? value.to_i : value
      result[key] = m[3] ? value.split(',') : value
    else
      value = find_field(header, field, false)
      value = value.to_i.to_s == value ? value.to_i : value
      result[key] = value
    end
  end

  result.empty? ? nil : result
end
132
+
133
# Reads lines from +stream+ until the :DELIMITER pattern of +info+ is
# found (the delimiter line is included), then extracts the declared
# fields from the accumulated header. Raises when the stream ends before
# the delimiter is seen.
def self.parse_header(stream, info)
  header = ""
  loop do
    line = stream.readline
    header << line
    break if line =~ /^#{info[:DELIMITER]}/i
    raise "Delimiter not found" if stream.eof?
  end

  get_info(header, info)
end
143
+
144
# Guesses the identifier format of every column of a platform codes TSV
# by matching its values against known Organism identifier lists.
# Returns [best_field, new_field_names, remaining_fields_by_count] where
# unrecognized columns are labelled "UNKNOWN(<original name>)".
# Fixed: the grep pattern is now quoted — an unquoted [[:alpha:]] is
# subject to shell filename globbing.
def self.guess_id(organism, codes)
  num_codes = codes.size
  best = nil
  best_count = 0
  new_fields = []
  field_counts = {}

  TmpFile.with_file(codes.to_s) do |codefile|
    codes.all_fields.each_with_index do |field, i|
      # Distinct, non-empty values of this column ('|' separates multi-values)
      values = CMD.cmd("cat #{ codefile }|cut -f #{ i + 1 }| tr '|' '\\n'|grep '[[:alpha:]]'|sort -u").read.split("\n").reject{|code| code.empty?}

      new_field, count = Organism.guess_id(organism, values)
      field_counts[new_field] = count
      Log.debug "Original field: #{ field }. New: #{new_field}. Count: #{ count }/#{num_codes}"

      # Accept the guess only when it covers more than half the codes
      # (denominator capped at 20000 for very large platforms)
      threshold = (num_codes > 20000 ? 20000 : num_codes).to_f * 0.5
      new_fields << (count > threshold ? new_field : "UNKNOWN(#{ field })")

      if count > best_count
        best = new_field
        best_count = count
      end
    end
  end

  field_counts.delete(new_fields.first)
  [best, new_fields, field_counts.sort_by{|field, counts| counts}.collect{|field, counts| field}.compact]
end
171
+
172
+ #{{{ GPL
173
+
174
# Builds the local files for a GEO platform (GPLxxx): downloads the SOFT
# family file, extracts header info and the probe annotation table,
# translates column identifiers to recognized formats, and writes the
# 'codes' TSV plus 'info.yaml' under +directory+. Returns the info hash.
# Fixed: File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
def self.GPL(platform, directory)
  FileUtils.mkdir_p directory unless File.exist? directory

  code_file = File.join(directory, 'codes')
  info_file = File.join(directory, 'info.yaml')

  stream = Open.open(GPL_URL.gsub('#PLATFORM#', platform), :nocache => true, :pipe => true)

  info = parse_header(stream, GPL_INFO)
  info[:code_file] = code_file
  info[:data_directory] = directory

  Log.medium "Producing code file for #{ platform }"
  # Stop reading the table at the end marker
  codes = TSV.open stream, :fix => proc{|l| l =~ /^!platform_table_end/i ? nil : l}, :header_hash => ""
  Log.low "Original fields: #{codes.key_field} - #{codes.fields * ", "}"
  stream.force_close

  best_field, all_new_fields, order = guess_id(Organism.organism(info[:organism]), codes)

  new_key_field, *new_fields = all_new_fields

  # Keep the original key field when no known format could be guessed for it
  new_key_field = codes.key_field if new_key_field =~ /^UNKNOWN/

  codes.key_field = new_key_field.dup
  codes.fields = new_fields.collect{|f| f.dup}

  Log.low "New fields: #{codes.key_field} - #{codes.fields * ", "}"

  Open.write(code_file, codes.reorder(:key, order).to_s(:sort, true))
  Open.write(info_file, info.to_yaml)

  info
end
207
+
208
# Parses the ^SUBSET declarations that precede the data table of a GDS
# SOFT stream. Returns {subset_type => {description => "GSM1,GSM2,..."}}.
# Fixed: the previous loop appended the line BEFORE testing for nil, so a
# stream that ended without a '!dataset_table_begin' marker raised a
# TypeError ("no implicit conversion of nil into String"); the loop now
# stops cleanly at end of stream.
def self.dataset_subsets(stream)
  text = ""
  while (line = stream.gets)
    break if line =~ /!dataset_table_begin/
    text << line
  end

  subsets = text.split(/\^SUBSET/).collect do |chunk|
    get_info(chunk, GDS_SUBSET_INFO)
  end

  info = {}
  subsets.each do |subset|
    type = subset[:type]
    description = subset[:description]
    samples = subset[:samples]
    info[type] ||= {}
    info[type][description] = samples
  end

  info
end
229
+
230
# Builds the local files for a GEO dataset (GDSxxx): downloads the SOFT
# file, extracts header info plus subset declarations, and writes the
# expression 'values' table (sample columns only) and 'info.yaml' under
# +directory+. Returns the info hash.
# Fixed: File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
def self.GDS(dataset, directory)
  FileUtils.mkdir_p directory unless File.exist? directory

  value_file = File.join(directory, 'values')
  info_file = File.join(directory, 'info.yaml')

  stream = Open.open(GDS_URL.gsub('#DATASET#', dataset), :nocache => true)

  info = parse_header(stream, GDS_INFO)
  info[:value_file] = value_file
  info[:data_directory] = directory

  info[:subsets] = dataset_subsets(stream)

  Log.medium "Producing values file for #{ dataset }"
  # Stop at the table end marker; normalize 'null' entries to NA
  values = TSV.open stream, :fix => proc{|l| l =~ /^!dataset_table_end/i ? nil : l.gsub(/null/,'NA')}, :header_hash => ""
  key_field = TSV.parse_header(GEO[info[:platform]]['codes'].open).key_field
  values.key_field = key_field

  # Keep only the sample columns (GSM accessions)
  samples = values.fields.select{|f| f =~ /GSM/}

  Open.write(value_file, values.slice(samples).to_s(:sort, true))
  Open.write(info_file, info.to_yaml)

  info
end
256
+ end
257
+
258
# Runs a differential expression analysis for +dataset+ comparing the
# samples of +condition+ against those of +control+ within the subset
# type +field+, writing the result to +path+.
def self.compare(dataset, field, condition, control, path)
  dataset_info = GEO[dataset]["info.yaml"].yaml

  platform = dataset_info[:platform]
  # Reading the platform info also produces it when missing
  platform_info = GEO[platform]["info.yaml"].yaml

  # Count data needs a log2 transform before the analysis
  log2 = ["count"].include? dataset_info[:value_type]
  samples = dataset_info[:subsets]
  value_file = GEO[dataset].values.find.produce
  format = TSV.parse_header(GEO[platform].codes.open).key_field

  # Resolve one subset value (or several) into the list of sample ids
  sample_ids = lambda do |value|
    if Array === value
      value.collect{|v| samples[field][v].split ","}.flatten
    else
      samples[field][value].split ","
    end
  end

  GE.analyze(value_file, sample_ids.call(condition), sample_ids.call(control), log2, path, format)
end
283
+ end
data/lib/rbbt/GE.rb ADDED
@@ -0,0 +1,36 @@
1
+ require 'rbbt/util/R'
2
+
3
module GE
  # Directory holding the bundled R support code (share/lib/R)
  LIB_DIR = File.join(File.expand_path(File.dirname(__FILE__)),'../../share/lib/R')
  MA = File.join(LIB_DIR, 'MA.R')

  # Runs +command+ in R after sourcing the MA.R helper library.
  def self.run_R(command)
    cmd = "source('#{MA}');" << command
    R.run(cmd)
  end

  # Serializes a Ruby value into an R literal: nil => NULL, Array => c(...),
  # integer-looking String => Integer, float-looking String => Float,
  # true/false => TRUE/FALSE, anything else => single-quoted string.
  # NOTE(review): single quotes inside string values are not escaped, so a
  # value containing "'" would produce broken R code — confirm inputs.
  def self.r_format(list)
    case
    when list.nil?
      "NULL"
    when Array === list
      "c(#{list.collect{|e| r_format e} * ", "})"
    when (String === list and list === list.to_i.to_s)
      list.to_i
    when (String === list and list === list.to_f.to_s)
      list.to_f
    when TrueClass === list
      "TRUE"
    when FalseClass === list
      "FALSE"
    else
      "'#{list.to_s}'"
    end
  end

  # Runs the rbbt.GE.process R routine over +datafile+, comparing +main+
  # samples against +contrast+ samples (or against zero when nil).
  # Results are written to +outfile+ when given.
  # Fixed: File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
  def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil)
    FileUtils.mkdir_p File.dirname(outfile) unless outfile.nil? or File.exist? File.dirname(outfile)
    GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main)}, contrast = #{r_format(contrast)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field})")
  end
end
36
+
@@ -0,0 +1,24 @@
1
+
2
# Rake rules that produce GEO resources on demand.

# Platform files: "GPLxxx", "GPLxxx/codes" or "GPLxxx/info.yaml"
rule /^(GPL\d+)\/?(codes|info\.yaml)?$/ do |t|
  t.name =~ /^(GPL\d+)\/?(codes|info\.yaml)?/
  platform, file = $1, $2
  target = file.nil? ? t.name : File.dirname(t.name)
  GEO::SOFT.GPL(platform, target)
end

# Dataset files: "GDSxxx", "GDSxxx/values" or "GDSxxx/info.yaml"
rule /^(GDS\d+)\/?(values|info\.yaml)?$/ do |t|
  t.name =~ /^(GDS\d+)\/?(values|info\.yaml)?/
  dataset, file = $1, $2
  target = file.nil? ? t.name : File.dirname(t.name)
  GEO::SOFT.GDS(dataset, target)
end

# Comparison results: "GDSxxx/comparison/<comparison name>"
rule /^(GDS\d+)\/comparison\/(.*)$/ do |t|
  t.name =~ /^(GDS\d+)\/comparison\/(.*)/
  dataset, name = $1, $2

  condition_field, condition_name, control_field, control_name = GEO.parse_comparison_name name

  GEO.compare(dataset, condition_field, condition_name, control_name, t.name)
end
data/share/lib/R/MA.R ADDED
@@ -0,0 +1,515 @@
1
+ library(limma)
2
+
3
+ #########################################################################
4
+ # Model processing
5
+
6
+ # Ratio
7
# Mean expression per probe (row), ignoring missing values.
rbbt.GE.process.ratio.oneside <- function(expr){
  ratio = apply(expr, 1, function(row){ mean(row, na.rm = TRUE) });
  names(ratio) <- rownames(expr);
  ratio;
}
12
+
13
# Difference of per-probe means between the main and contrast groups.
rbbt.GE.process.ratio.twoside <- function(expr, contrast){
  diff = rbbt.GE.process.ratio.oneside(expr) - rbbt.GE.process.ratio.oneside(contrast);
  names(diff) <- rownames(expr);
  diff;
}
18
+
19
+ # Limma
20
# Moderated t-test (limma) of expression values against zero; when a
# subset of sample names is given, membership in the subset is modeled.
# P-values carry the sign of the t statistic.
# BUG FIX: the original built 'design' only in the else branch but then
# unconditionally called lmFit(expr, design), which raised
# "object 'design' not found" when subset was NULL (and fitted twice
# otherwise). The second fit now happens only when a design exists.
rbbt.GE.process.limma.oneside <- function(expr, subset = NULL){

  if (is.null(subset)){
    fit <- lmFit(expr);
  }else{
    design = rep(0, dim(expr)[2]);
    design[names(expr) %in% subset] = 1;
    fit <- lmFit(expr, design);
  }

  fit <- eBayes(fit);

  # Negative t statistics flip the sign of the reported p-value
  sign = fit$t < 0;
  sign[is.na(sign)] = FALSE;
  fit$p.value[sign] = - fit$p.value[sign];

  return(list(t= fit$t, p.values= fit$p.value));
}
39
+
40
# Moderated t-test (limma) comparing the samples in subset.main against
# those in subset.contrast. P-values carry the sign of the t statistic.
rbbt.GE.process.limma.twoside <- function(expr, subset.main, subset.contrast){

  n.samples = dim(expr)[2];
  design = cbind(rep(1, n.samples), rep(0, n.samples));
  colnames(design) <- c('intercept', 'expr');
  design[names(expr) %in% subset.main,] = 1;
  design[names(expr) %in% subset.contrast, 'intercept'] = 1;

  fit <- lmFit(expr, design);

  fit <- eBayes(fit);

  # Negative t statistics flip the sign of the reported p-value
  negative = fit$t[,2] < 0;
  negative[is.na(negative)] = FALSE;
  fit$p.value[negative, 2] = - fit$p.value[negative, 2];

  return(list(t = fit$t[,2], p.values = fit$p.value[,2]));
}
56
+
57
+
58
+
59
# Differential expression entry point called from Ruby (GE.analyze).
# Computes per-probe log-ratios and, when limma succeeds, moderated t
# statistics with sign-encoded p-values. Writes a TSV when 'outfile' is
# given, otherwise returns the result data frame.
# BUG FIX: the fallback retries previously assigned 'limma' with '=' inside
# the tryCatch error handlers; such assignments are local to the handler
# function, so a successful retry was silently discarded and 'limma'
# stayed NULL. The retries now use '<<-' to update the function's variable.
rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL){
  data = rbbt.tsv(file);
  ids = rownames(data);

  if (log2){
    data = log2(data);
  }

  if (is.null(contrast)){
    ratio = rbbt.GE.process.ratio.oneside(subset(data, select=main));
  }else{
    ratio = rbbt.GE.process.ratio.twoside(subset(data, select=main), subset(data, select=contrast) );
  }

  limma = NULL;
  if (is.null(contrast)){
    tryCatch({
      limma <- rbbt.GE.process.limma.oneside(data, main);
    }, error=function(x){
      cat("Limma failed for complete dataset. Trying just subset.\n", file=stderr());
      print(x, file=stderr());
      tryCatch({
        limma <<- rbbt.GE.process.limma.oneside(subset(data, select=main));
      }, error=function(x){
        cat("Limma failed for subset dataset.\n", file=stderr());
        print(x, file=stderr());
      });
    })
  }else{
    tryCatch({
      limma <- rbbt.GE.process.limma.twoside(data, main, contrast);
    }, error=function(x){
      cat("Limma failed for complete dataset. Trying just subset.\n", file=stderr());
      print(x, file=stderr());
      tryCatch({
        limma <<- rbbt.GE.process.limma.twoside(subset(data, select=c(main, contrast)), main, contrast);
      }, error=function(x){
        cat("Limma failed for subset dataset.\n", file=stderr());
        print(x, file=stderr());
      });
    })
  }

  if (! is.null(limma)){
    result = data.frame(ratio = ratio[ids], t.values = limma$t[ids], p.values = limma$p.values[ids])
  }else{
    result = data.frame(ratio = ratio)
  }

  if (is.null(outfile)){
    return(result);
  }else{
    rbbt.tsv.write(outfile, result, key.field, ":type=:list#:cast=:to_f");
    return(NULL);
  }
}
117
+
118
+
119
+
120
+ ############################################################################
121
+ ############################################################################
122
+ ############################################################################
123
+ ############################################################################
124
+ ############################################################################
125
+ # OLD STUFF
126
+
127
+
128
# For each column, rank rows by decreasing value (dropping NAs) and return
# a matrix of ranks aligned with the original rows.
MA.get_order <- function(values){
  orders = values;
  orders[,] = NA;

  for (col in 1:dim(values)[2]){
    ranked = names(sort(values[,col], decreasing=T, na.last=NA));
    orders[,col] = NA;
    orders[ranked, col] = 1:length(ranked)
  }
  orders
}
139
+
140
# Heuristic for whether a matrix still needs a log2 transform: two-channel
# data with no negative values, or one-channel data whose maximum exceeds
# 100, is assumed to be in linear scale.
MA.guess.do.log2 <- function(m, two.channel){
  if (two.channel){
    sum(m < 0, na.rm = TRUE) == 0;
  }else{
    max(m, na.rm = TRUE) > 100;
  }
}
147
+
148
# Collapses probe-level rows to gene-level rows using the translation
# vector 'trans' (one entry per probe), taking the median over probes
# that map to the same gene. Empty and "NO MATCH" translations are dropped.
MA.translate <- function(m, trans){
  trans[trans==""] = NA;
  trans[trans=="NO MATCH"] = NA;

  missing = length(trans) - dim(m)[1];

  # Translation vector shorter than the matrix: pad it with NAs
  if (missing < 0){
    trans = c(trans, rep(NA, - missing));
    missing = 0;
  }
  n = apply(m, 2, function(column){
    # Pad the column so its length matches the translation vector
    padded = c(column, rep(NA, missing));
    tapply(padded, factor(trans), median)
  });
  n[sort(rownames(n), index.return=T)$ix,]
}
166
+
167
+ # Conditions
168
+
169
# Returns the first condition value that looks like a control/reference
# sample (matched against a keyword list), or NULL when none matches.
# Fixed: grep() returns matching indices, and the original any(control)
# relied on numeric-to-logical coercion (warning-prone); the emptiness
# test is now explicit.
MA.conditions.has_control <- function(x){
  keywords = c('none', 'control', 'normal', 'wild', 'baseline', 'untreat', 'uninfected', 'universal', 'reference', 'vehicle', 'w.t.','wt');
  for(keyword in keywords){
    control = grep(keyword, x, ignore.case = TRUE);
    if (length(control) > 0){
      return(x[control[1]]);
    }
  }
  return(NULL)
}
179
+
180
# Collects the distinct condition values, setting aside a detected control
# value, and orders the rest (numerically when every level starts with a
# number, lexicographically otherwise, both decreasing).
MA.condition.values <- function(values){
  control = MA.conditions.has_control(values);

  condition.levels = levels(factor(values));

  # The control level is reported separately, not among the ordered values
  if (!is.null(control))
    condition.levels = condition.levels[condition.levels != control];

  # Numeric sort when all levels carry a leading number
  if (length(grep('^ *[0-9]+', condition.levels, perl=TRUE)) == length(condition.levels)){
    ix = sort(as.numeric(sub('^ *([0-9]+).*', "\\1", condition.levels)), decreasing = T, index.return = TRUE)$ix
  }else{
    ix = sort(condition.levels, decreasing = T, index.return = TRUE)$ix
  }

  return(list(values = condition.levels[ix], control = control));
}
200
+
201
+
202
+ #########################################################################
203
+ # Model processing
204
+
205
+ # Ratio
206
# Mean log-ratio of the samples labelled 'main' (two-channel data already
# encodes a ratio per probe). Single-sample selections are returned as-is.
MA.ratio.two_channel <- function(m, conditions, main){
  selected = m[, conditions == main];
  if (!is.null(dim(selected))){
    selected = apply(selected, 1, function(row){ mean(row, na.rm = TRUE) });
  }
  return(selected);
}
213
+
214
# Difference between the per-probe mean of the 'main' samples and the mean
# of the 'contrast' samples. Single-sample selections are used directly.
MA.ratio.contrast <- function(m, conditions, main, contrast){
  main.values = m[, conditions == main];
  if (!is.null(dim(main.values))){
    main.values = apply(main.values, 1, function(row){ mean(row, na.rm = TRUE) });
  }

  contrast.values = m[, conditions == contrast];
  if (!is.null(dim(contrast.values))){
    contrast.values = apply(contrast.values, 1, function(row){ mean(row, na.rm = TRUE) });
  }

  return (main.values - contrast.values);
}
227
+
228
+
229
+ # Limma
230
+
231
# Moderated t-test of the 'main' samples against zero (two-channel ratios).
# Returns NULL with fewer than 3 samples or when eBayes fails; p-values
# carry the sign of the t statistic.
MA.limma.two_channel <- function(m, conditions, main){
  if (sum(conditions == main) < 3){
    return(NULL);
  }

  membership = rep(0, dim(m)[2]);
  membership[conditions == main] = 1;

  # We need to subset the columns because of a problem with NA values. This
  # might affect eBayes variance estimations.
  fit <- lmFit(m[, membership == 1], rep(1, sum(membership)));

  tryCatch({
    fit <- eBayes(fit);
    negative = fit$t < 0;
    negative[is.na(negative)] = FALSE;
    fit$p.value[negative] = - fit$p.value[negative];
    return(list(t = fit$t, p.values = fit$p.value));
  }, error=function(x){
    print("Exception caught in eBayes");
    print(x);
  })

  return(NULL);
}
257
+
258
# Moderated t-test between 'main' and 'contrast' samples. Returns NULL
# with fewer than 3 samples in total or when eBayes fails; p-values carry
# the sign of the t statistic.
MA.limma.contrast <- function(m, conditions, main, contrast){
  if (sum(conditions == main) + sum(conditions == contrast) < 3){
    return(NULL);
  }
  m = cbind(m[, conditions == main], m[, conditions == contrast]);

  # Intercept column plus a membership column for the 'main' group
  design = cbind(rep(1, dim(m)[2]), rep(0, dim(m)[2]));
  colnames(design) <- c('intercept', 'main');
  design[1:sum(conditions == main), 2] = 1;

  fit <- lmFit(m, design);
  tryCatch({
    fit <- eBayes(fit);
    negative = fit$t[,2] < 0;
    negative[is.na(negative)] = FALSE;
    fit$p.value[negative, 2] = - fit$p.value[negative, 2]
    return(list(t = fit$t[,2], p.values = fit$p.value[,2] ));
  }, error=function(x){
    print("Exception caught in eBayes");
    print(x);
  })

  return(NULL);
}
283
+
284
+
285
+ #########################################################################
286
+ # Process conditions
287
+
288
# Removes leading and trailing spaces from a string (vectorized).
MA.strip_blanks <- function(text){
  text = sub('^ *', '', text);
  sub(' *$', '', text);
}
294
+
295
# Builds a rank matrix preferring t statistics when a comparison has them
# and falling back to ratios otherwise (columns marked "[ratio]").
MA.orders <- function(ratios, t){
  best = vector();
  labels = vector();
  for (comparison in colnames(ratios)){
    if (sum(colnames(t) == comparison) > 0){
      best = cbind(best, t[, comparison]);
      labels = c(labels, comparison);
    }else{
      best = cbind(best, ratios[, comparison]);
      labels = c(labels, paste(comparison, '[ratio]', sep=" "));
    }
  }
  rownames(best) <- rownames(ratios);
  orders <- as.data.frame(MA.get_order(best));
  colnames(orders) <- labels;

  return(orders);
}
313
+
314
# Computes ratio and limma statistics for pairs of condition values: each
# value against the control (when detected), plus all pairwise comparisons
# when the number of levels is small enough.
MA.process_conditions.contrasts <- function(m, conditions, two.channel){
  max_levels = 10;
  max_levels_control = 1;

  condition.info = MA.condition.values(conditions);

  ratios = vector();
  t = vector();
  p.values = vector();

  ratio_names = vector();
  t_names = vector();

  # Accumulates ratio and limma results for one (main, contrast) pair.
  # NOTE: uses <<- to update the enclosing function's accumulators.
  add.comparison <- function(main, contrast){
    label = paste(main, contrast, sep = " <=> ");

    ratios <<- cbind(ratios, MA.ratio.contrast(m, conditions, main, contrast));
    ratio_names <<- c(ratio_names, label);

    res = MA.limma.contrast(m, conditions, main, contrast);
    if (!is.null(res)){
      t_names <<- c(t_names, label);
      t <<- cbind(t, res$t);
      p.values <<- cbind(p.values, res$p.values);
    }
  }

  # Each level against the control
  if (!is.null(condition.info$control)){
    for (main in condition.info$values){
      add.comparison(main, condition.info$control);
    }
  }

  # All-versus-all comparisons when the level count is manageable
  if (length(condition.info$values) <= max_levels_control || (is.null(condition.info$control) && !two.channel && length(condition.info$values) <= max_levels )){
    remaining = condition.info$values;
    for (main in condition.info$values){
      remaining = remaining[remaining != main];
      for (contrast in remaining){
        add.comparison(main, contrast);
      }
    }
  }

  if (length(ratio_names) != 0){
    ratio_names = as.vector(sapply(ratio_names, MA.strip_blanks));
    colnames(ratios) <- ratio_names
  }

  if (length(t_names) != 0){
    t_names = as.vector(sapply(t_names, MA.strip_blanks));
    colnames(t) <- t_names;
    colnames(p.values) <- t_names;
  }

  return(list(ratios = ratios, t=t, p.values = p.values));
}
385
+
386
# Computes per-level statistics for two-channel data: every condition
# value (including the control, when detected) is tested against a zero
# log-ratio.
MA.process_conditions.two_channel <- function(m, conditions){
  condition.info = MA.condition.values(conditions);

  all_values = condition.info$values;
  if (!is.null(condition.info$control)){
    all_values = c(all_values, condition.info$control);
  }

  ratios = vector();
  t = vector();
  p.values = vector();

  ratio_names = vector();
  t_names = vector();

  for (main in all_values){
    ratios = cbind(ratios, MA.ratio.two_channel(m, conditions, main));
    ratio_names = c(ratio_names, main);

    res = MA.limma.two_channel(m, conditions, main);
    if (!is.null(res)){
      t_names = c(t_names, main);
      t = cbind(t, res$t);
      p.values = cbind(p.values, res$p.values);
    }
  }

  if (length(ratio_names) != 0){
    ratio_names = as.vector(sapply(ratio_names, MA.strip_blanks));
    colnames(ratios) <- ratio_names
  }

  if (length(t_names) != 0){
    t_names = as.vector(sapply(t_names, MA.strip_blanks));
    colnames(t) <- t_names;
    colnames(p.values) <- t_names;
  }

  return(list(ratios = ratios, t=t, p.values = p.values));
}
431
+
432
+
433
+
434
+ # Process microarray matrix
435
+
436
# Processes a microarray matrix against every condition assignment in
# 'conditions_list', accumulating ratio, t and p-value matrices whose
# column names are prefixed with the condition type.
MA.process <- function(m, conditions_list, two.channel = FALSE){

  ratios = vector();
  t = vector();
  p.values = vector();

  # Prefixes each comparison column name with the condition type
  prefix.names <- function(mat, type){
    colnames(mat) <- sapply(colnames(mat), function(x){ paste(type, x, sep=": ") });
    mat
  }

  for(type in colnames(conditions_list)){
    conditions = conditions_list[, type]

    if (two.channel){
      res = MA.process_conditions.two_channel(m, conditions);
      if (length(res$ratios) != 0){ ratios = cbind(ratios, prefix.names(res$ratios, type)); }
      if (length(res$t) != 0){ t = cbind(t, prefix.names(res$t, type)); }
      if (length(res$p.values) != 0){ p.values = cbind(p.values, prefix.names(res$p.values, type)); }
    }

    res = MA.process_conditions.contrasts(m, conditions, two.channel);
    if (length(res$ratios) != 0){ ratios = cbind(ratios, prefix.names(res$ratios, type)); }
    if (length(res$t) != 0){ t = cbind(t, prefix.names(res$t, type)); }
    if (length(res$p.values) != 0){ p.values = cbind(p.values, prefix.names(res$p.values, type)); }
  }

  orders <- MA.orders(ratios, t);
  return(list(ratios = ratios, t=t, p.values = p.values, orders=orders));
}
461
+
462
+
463
# Writes the analysis matrices to '<prefix>.<name>' files, or a '.skip'
# marker file when there was nothing suitable to analyze.
MA.save <- function(prefix, orders, ratios, t , p.values, experiments, description = NULL) {
  if (is.null(orders)){
    cat("No suitable samples for analysis\n")
    write(file=paste(prefix,'skip',sep="."), "No suitable samples for analysis" );
  } else {
    write.table(file=paste(prefix,'orders',sep="."), orders, sep="\t", row.names=F, col.names=F, quote=F);
    write.table(file=paste(prefix,'codes',sep="."), rownames(orders), sep="\t", row.names=F, col.names=F, quote=F);
    write.table(file=paste(prefix,'logratios',sep="."), ratios, sep="\t", row.names=F, col.names=F, quote=F);
    write.table(file=paste(prefix,'t',sep="."), t, sep="\t", row.names=F, col.names=F, quote=F);
    write.table(file=paste(prefix,'pvalues',sep="."), p.values, sep="\t", row.names=F, col.names=F, quote=F);
    write.table(file=paste(prefix,'experiments',sep="."), experiments, sep="\t", row.names=F, col.names=F, quote=F);

    write(file=paste(prefix,'description',sep="."), description)
  }
}
478
+
479
# Loads the matrices written by MA.save. The t and p-value matrices only
# have columns for comparisons with statistics (those not marked "ratio").
# BUG FIX: the original used experiments[- grep('ratio', experiments)];
# when no experiment name contains 'ratio', grep returns integer(0) and
# x[-integer(0)] selects NOTHING instead of everything. Logical indexing
# with grepl handles the empty-match case correctly.
MA.load <- function(prefix, orders = TRUE, logratios = TRUE, t = TRUE, p.values = TRUE){
  data = list();
  genes <- scan(file=paste(prefix,'codes',sep="."),sep="\n",quiet=T,what=character());
  experiments <- scan(file=paste(prefix,'experiments',sep="."),sep="\n",quiet=T,what=character());

  experiments.no.ratio = experiments[!grepl('ratio', experiments)];

  if (orders){
    orders <- read.table(file=paste(prefix,'orders',sep="."),sep="\t");
    rownames(orders) <- genes;
    colnames(orders) <- experiments;
    data$orders=orders;
  }
  if (logratios){
    logratios <- read.table(file=paste(prefix,'logratios',sep="."),sep="\t");
    rownames(logratios) <- genes;
    colnames(logratios) <- experiments;
    data$logratios=logratios;
  }
  if (t){
    t <- read.table(file=paste(prefix,'t',sep="."),sep="\t");
    rownames(t) <- genes;
    colnames(t) <- experiments.no.ratio;
    data$t=t;
  }
  if (p.values){
    p.values <- read.table(file=paste(prefix,'pvalues',sep="."),sep="\t");
    rownames(p.values) <- genes;
    colnames(p.values) <- experiments.no.ratio;
    data$p.values=p.values;
  }

  return(data);
}
@@ -0,0 +1,104 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/GE/GEO'
3
+
4
# Integration tests for GEO/GE (network access to the GEO FTP is required).
# Fixed: test_GSE was defined twice — Ruby silently keeps only the second
# definition, so the duplicate has been removed. File.exists? replaced by
# File.exist? (removed in Ruby 3.2).
class TestClass < Test::Unit::TestCase

  def test_control_sample
    assert GEO.control_samples('GDS750').include? "GSM16978"
  end

  def test_GDS
    assert_equal 'GPL999', GEO.dataset_info('GDS750')[:platform]
  end

  def test_GPL
    assert_equal 'Saccharomyces cerevisiae', GEO["GPL999/info.yaml"].yaml[:organism]
    assert_equal 'Homo sapiens', GEO["GPL570/info.yaml"].yaml[:organism]
    assert GEO.GPL999.codes.fields.include? "Ensembl Gene ID"
  end

  def test_normalize
    dataset = 'GDS750'
    gene = "YPR191W"
    id = "6079"

    platform = GEO.GDS(dataset)[:platform]
    translated = GEO.normalize(platform, ["YPR191W"]).first.first

    assert_equal id, translated
  end

  def test_analyze_single
    dataset = 'GDS750'
    info = GEO.GDS(dataset)

    assert GE.analyze(info[:data_file], info[:subsets]["agent"]["tunicamycin"] ).read =~ /1234/;
  end

  def test_analyze_contrast
    dataset = 'GDS750'
    info = GEO.GDS(dataset)
    outfile = File.join(File.dirname(info[:data_file]), 'results')
    key_field = TSV.headers(GEO.GPL(info[:platform])[:code_file]).first

    TmpFile.with_file do |f|
      GE.analyze(info[:data_file], info[:subsets]["agent"]["tunicamycin"], info[:subsets]["agent"]["DTT"], false, f, key_field);
      assert File.exist? f
      FileUtils.rm f
    end
  end

  def test_process_subset
    dataset = 'GDS750'
    subset = 'agent'
    id = "6079"
    info = GEO.GDS(dataset)
    outfile = File.join(File.dirname(info[:data_file]), 'results')
    key_field = TSV.headers(GEO.GPL(info[:platform])[:code_file]).first

    TmpFile.with_file do |f|
      GEO.process_subset(dataset, subset, nil, f)
      assert File.exist? f
      FileUtils.rm f
    end

    t = GEO.process_subset(dataset, subset, 'tunicamycin')
    assert File.exist? File.join(File.dirname(info[:data_file]), 'analyses/subset.agent.tunicamycin')
    d = GEO.process_subset(dataset, subset, 'DTT')
    assert File.exist? File.join(File.dirname(info[:data_file]), 'analyses/subset.agent.DTT')

    assert_in_delta t[id]["p.values"], - d[id]["p.values"], 0.0001
  end

  def test_GSE
    gse="GSE966"
    info = GEO.GSE(gse)
    assert_equal "GPL764", info[:platform]
  end

  def test_compare
    dataset = "GDS1479"
    field = "specimen"
    condition = "carcinoma in situ lesion"
    control = "normal mucosa"

    TmpFile.with_file do |path|
      GEO.compare(dataset, field, condition, control, path)
      assert File.exist? path
    end

    assert GEO[dataset].comparison[GEO.comparison_name field, condition, control].produce.exists?
  end

end
+
@@ -0,0 +1,4 @@
1
+ require 'test/unit'
2
+ $LOAD_PATH.unshift(File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib')))
3
+ $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__)))
4
+
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbbt-GE
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Miguel Vazquez
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-01-19 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rbbt-util
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: Gene Expression in RBBT
36
+ email: miguel.vazquez@cnio.es
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files:
42
+ - LICENSE
43
+ files:
44
+ - LICENSE
45
+ - lib/rbbt/GE.rb
46
+ - lib/rbbt/GE/GEO.rb
47
+ - share/install/GEO/Rakefile
48
+ - share/lib/R/MA.R
49
+ - test/test_helper.rb
50
+ - test/rbbt/GE/test_GEO.rb
51
+ has_rdoc: true
52
+ homepage: http://github.com/mikisvaz/rbbt-GE
53
+ licenses: []
54
+
55
+ post_install_message:
56
+ rdoc_options: []
57
+
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ hash: 3
66
+ segments:
67
+ - 0
68
+ version: "0"
69
+ required_rubygems_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ hash: 3
75
+ segments:
76
+ - 0
77
+ version: "0"
78
+ requirements: []
79
+
80
+ rubyforge_project:
81
+ rubygems_version: 1.6.2
82
+ signing_key:
83
+ specification_version: 3
84
+ summary: Gene Expression in RBBT
85
+ test_files:
86
+ - test/test_helper.rb
87
+ - test/rbbt/GE/test_GEO.rb