rbbt-marq 1.0.9 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/R/GEO.R CHANGED
@@ -1,106 +1,8 @@
1
1
  library(Biobase);
2
2
  library(GEOquery);
3
3
 
4
-
5
- GEO.path <- function(dataset, cross_platform = FALSE, datadir = NULL){
6
- if (is.null(datadir) && exists('MARQ.config')){
7
- datadir= paste(MARQ.config$datadir, 'GEO', sep="/");
8
- }
9
-
10
- if (is.null(datadir)){
11
- print("No datadir specified and no default found (MARQ.config$datadir");
12
- exit(-1);
13
- }
14
-
15
- if ( length(grep('_cross_platform', dataset)) == 0 && cross_platform){
16
- dataset = paste(dataset, '_cross_platform', sep = "");
17
-
18
- }
19
-
20
-
21
- files = Sys.glob(paste(datadir,'*', '*', paste(dataset, 'orders', sep="."), sep="/"));
22
-
23
- if (length(files) == 0){
24
- return(NULL);
25
- }
26
- else{
27
- return(sub('.orders','', files[1]));
28
- }
29
- }
30
-
31
- GEO.platform <- function(dataset, datadir = NULL){
32
-
33
- path = GEO.path(dataset, datadir = datadir);
34
-
35
- if (is.null(path)){ return(NULL);}
36
-
37
- return(sub(".*(GPL\\d+).*","\\1", path, perl = TRUE));
38
- }
39
-
40
- GEO.platform.path <- function(platform, datadir = NULL){
41
- if (is.null(datadir) && exists('MARQ.config')){
42
- datadir= paste(MARQ.config$datadir, 'GEO', sep="/");
43
- }
44
-
45
- if (is.null(datadir)){
46
- print("No datadir specified and no default found (MARQ.config$datadir");
47
- exit(-1);
48
- }
49
-
50
- return(paste(datadir, platform, sep="/"));
51
- }
52
-
53
- GEO.platform.datasets <- function(platform, cross_platform = TRUE, series = TRUE, datadir = NULL){
54
- if (cross_platform){
55
- cp.suffix = '_cross_platform'
56
- }
57
- else{
58
- cp.suffix = ''
59
- }
60
-
61
- if (series){
62
- pattern = '*'
63
- }
64
- else{
65
- pattern = 'GDS'
66
- }
67
- files = Sys.glob(paste(GEO.platform.path(platform, datadir), pattern, paste('*',cp.suffix,'.orders',sep=""),sep="/"))
68
-
69
- return(sapply(files, function(path){ sub(".*((?:GDS|GSE)\\d+).*", '\\1', path, perl=TRUE)}, USE.NAMES = FALSE));
70
- }
71
-
72
- GEO.values <- function(data){
73
- values <- MA.process(data$m, data$conditions, data$two.channel)
74
-
75
- if (length(values$ratios) == 0){
76
- return(NULL);
77
- }else{
78
- ratios = as.data.frame(values$ratios);
79
- t = as.data.frame(values$t);
80
- p.values = as.data.frame(values$p.values);
81
-
82
-
83
- # Calculate orders from best information
84
- best = vector();
85
- names = vector();
86
- for (name in colnames(ratios)){
87
- if (sum(colnames(t) == name) > 0){
88
- best = cbind(best, t[,name]);
89
- names = c(names, name);
90
- }else{
91
- best = cbind(best, ratios[,name]);
92
- names = c(names, paste(name,'[ratio]', sep=" "));
93
- }
94
- }
95
- rownames(best) <- rownames(ratios)
96
- orders = as.data.frame(MA.get_order(best));
97
- colnames(orders) <- names
98
-
99
- return(list(ratios = ratios, t = t, p.values = p.values, orders = orders));
100
- }
101
- }
102
-
103
-
4
+ ####################################################
5
+ # Data retrieval functions
104
6
 
105
7
  GEO.get <- function(name, cachedir = NULL){
106
8
  if (is.null(cachedir) && exists('MARQ.config')){
@@ -161,7 +63,10 @@ GEO.GDS.data <- function(name, id.field = NULL, translation.file = NULL, cachedi
161
63
  if (!is.null(id.field)){
162
64
  trans = featureData(eSet)[[id.field]];
163
65
  }
164
- if (!is.null(translation.file)){
66
+ if (!is.null(translation.file) && translation.file != FALSE){
67
+ if (translation.file == TRUE && exists(MARQ.platform.path)){
68
+ translation.file = paste(MARQ.platform.path(gpl_name), 'translations', sep="/");
69
+ }
165
70
  trans = scan(file=translation.file,what=character(),sep="\n",quiet=T);
166
71
  }
167
72
 
@@ -169,33 +74,12 @@ GEO.GDS.data <- function(name, id.field = NULL, translation.file = NULL, cachedi
169
74
  m <- MA.translate(m, trans);
170
75
  }
171
76
 
172
- return (list(conditions = conditions, m = m, two.channel = two.channel, description = description))
173
- }
174
77
 
175
78
 
176
- GEO.GDS.process <- function(name, prefix, id.field = NULL, translation.file = NULL,cachedir=NULL){
177
- tryCatch(
178
- {
179
- gds.data = GEO.GDS.data(name, id.field, translation.file, cachedir)
180
- values = GEO.values(gds.data)
181
- if (is.null(values)){
182
- write(file=paste(prefix,'skip',sep="."), "No suitable samples for analysis" );
183
- }else{
184
- MA.save(prefix, values$orders, values$ratios, values$t, values$p.values, colnames(values$orders), gds.data$description);
185
- }
186
- }
187
- ,
188
- error=function(x){
189
- print("Exception caught");
190
- print(x);
191
- write(file=paste(prefix,'skip',sep="."), paste("An exception was caught during the analysis.",x,sep="\n") );
192
- }
193
- )
79
+ return (list(conditions = conditions, m = m, two.channel = two.channel, description = description))
194
80
  }
195
81
 
196
82
 
197
-
198
-
199
83
  GEO.GSE.data <- function(gsms, conditions, do.log2 = NULL, translation.file = NULL, use.fields = NULL, cachedir = NULL){
200
84
 
201
85
  c = sapply(conditions,function(x){x});
@@ -261,7 +145,10 @@ GEO.GSE.data <- function(gsms, conditions, do.log2 = NULL, translation.file = NU
261
145
  }
262
146
 
263
147
  trans = NULL
264
- if (!is.null(translation.file)){
148
+ if (!is.null(translation.file) && translation.file != FALSE){
149
+ if (translation.file == TRUE && exists(MARQ.platform.path)){
150
+ translation.file = paste(MARQ.platform.path(gpl_name), 'translations', sep="/");
151
+ }
265
152
  trans = read.table(file=translation.file, sep="\t",header=F)[,1];
266
153
  }
267
154
  if (!is.null(trans)){
@@ -282,6 +169,63 @@ GEO.GSE.data <- function(gsms, conditions, do.log2 = NULL, translation.file = NU
282
169
  return (list(conditions = conditions, m = m, two.channel = two.channel))
283
170
  }
284
171
 
172
+
173
+ ####################################################
174
+ # Processing Functions
175
+
176
# Compute the ratio, t and p-value tables for a dataset, plus an ordering
# table built from the best statistic available for each condition.
#
# data: list with elements `m`, `conditions` and `two.channel`, which are
#       passed straight to MA.process.
#
# Returns NULL when MA.process yields no ratios; otherwise a list with the
# data frames `ratios`, `t`, `p.values` and `orders`.
GEO.values <- function(data) {
  processed <- MA.process(data$m, data$conditions, data$two.channel)

  # Nothing to order when no ratios could be computed
  if (length(processed$ratios) == 0) {
    return(NULL)
  }

  ratios <- as.data.frame(processed$ratios)
  t <- as.data.frame(processed$t)
  p.values <- as.data.frame(processed$p.values)

  # Calculate orders from best information: use the t column of a condition
  # when there is one, and fall back to its ratio column otherwise
  conditions <- colnames(ratios)
  has.t <- conditions %in% colnames(t)

  columns <- lapply(seq_along(conditions), function(i) {
    if (has.t[i]) t[, conditions[i]] else ratios[, conditions[i]]
  })
  best <- do.call(cbind, columns)
  rownames(best) <- rownames(ratios)

  # Conditions ordered by ratio are labelled so they can be told apart
  labels <- ifelse(has.t, conditions, paste(conditions, '[ratio]', sep = " "))

  orders <- as.data.frame(MA.get_order(best))
  colnames(orders) <- labels

  list(ratios = ratios, t = t, p.values = p.values, orders = orders)
}
206
+
207
+
208
+
209
# Process a GDS dataset and save the results under `prefix`.
#
# name:             dataset name, passed to GEO.GDS.data.
# prefix:           path prefix for the output; results go through MA.save,
#                   and a `<prefix>.skip` file is written when the dataset
#                   cannot be analysed.
# id.field, translation.file, cachedir: passed through to GEO.GDS.data.
#
# Errors raised during the analysis do not propagate: they are printed and
# recorded in the `<prefix>.skip` file.
GEO.GDS.process <- function(name, prefix, id.field = NULL, translation.file = NULL, cachedir = NULL) {
  skip.file <- paste(prefix, 'skip', sep = ".")

  tryCatch(
    {
      gds.data <- GEO.GDS.data(name, id.field, translation.file, cachedir)
      values <- GEO.values(gds.data)
      if (is.null(values)) {
        write(file = skip.file, "No suitable samples for analysis")
      } else {
        MA.save(prefix, values$orders, values$ratios, values$t, values$p.values,
                colnames(values$orders), gds.data$description)
      }
    },
    error = function(x) {
      print("Exception caught")
      print(x)
      # Paste only the message: pasting the condition object itself expands
      # to one string per field (message and call), which duplicated the
      # header line in the skip file.
      write(file = skip.file,
            paste("An exception was caught during the analysis.", conditionMessage(x), sep = "\n"))
    }
  )
}
228
+
285
229
  GEO.GSE.process <- function(gsms, conditions, prefix, do.log2 = NULL, translation.file = NULL, use.field = NULL, title = NULL, description = NULL,cachedir=NULL){
286
230
  tryCatch(
287
231
  {
@@ -0,0 +1,44 @@
1
+ library(GEOquery);
2
+
3
# Convert a GDS object into an ExpressionSet.
#
# The original version of the function failed if the dataset had extra probe
# ids not in the platform. This version drops those probes instead.
#
# GDS:      the GDS object to convert.
# do.log2:  when TRUE the expression values are log2-transformed.
# GPL:      platform object; fetched with getGEO when NULL.
# AnnotGPL: passed to getGEO when the platform has to be fetched.
#
# Returns an ExpressionSet holding only the probes found in the platform.
"GDS2eSet" <-
function(GDS, do.log2 = FALSE, GPL = NULL, AnnotGPL = TRUE) {
  # library() fails loudly when Biobase is missing; require() only returns FALSE
  library(Biobase)

  if (is.null(GPL)) {
    GPL <- getGEO(Meta(GDS)$platform, AnnotGPL = AnnotGPL)
  }

  gds.table <- Table(GDS)
  ord.table <- match(gds.table[, 1], Table(GPL)[, 1])
  # Probes of the dataset that are also present in the platform
  known <- !is.na(ord.table)

  # Only the sample (GSM) columns carry expression values. drop = FALSE keeps
  # the data frame shape when there is a single sample column.
  inc.columns <- grep('GSM', colnames(gds.table))
  samples <- gds.table[, inc.columns, drop = FALSE]
  # Non-numeric entries become NA; the coercion warnings are expected
  values <- suppressWarnings(
    vapply(samples, function(x) as.numeric(as.character(x)), numeric(nrow(samples)))
  )
  # Rebuild explicitly as probes x samples so one-row or one-column tables
  # are not collapsed or transposed
  mat <- matrix(values, nrow = nrow(samples), ncol = ncol(samples),
                dimnames = list(NULL, colnames(samples)))

  expr <- if (do.log2) log2(mat) else mat
  rownames(expr) <- as.character(gds.table$ID_REF)

  tmp <- Columns(GDS)
  rownames(tmp) <- as.character(tmp$sample)
  pheno <- new("AnnotatedDataFrame", data = tmp)

  # Missing metadata fields become ""; only the first value of a field is
  # used, as before
  meta <- Meta(GDS)
  first.or.empty <- function(x) if (is.null(x)) "" else x[1]
  mabstract <- first.or.empty(meta$description)
  mpubmedids <- first.or.empty(meta$pubmed_id)
  mtitle <- first.or.empty(meta$title)

  dt <- Table(GPL)
  rownames(dt) <- as.character(dt$ID)
  featuredata <- new('AnnotatedDataFrame', data = dt[ord.table, , drop = FALSE],
                     varMetadata = data.frame(Column = Columns(GPL)[, 1],
                                              labelDescription = Columns(GPL)[, 2]))

  # use `known` to remove extra probe ids in GDS and not in GPL; drop = FALSE
  # keeps exprs a matrix when a single probe remains
  eset <- new('ExpressionSet', exprs = expr[known, , drop = FALSE], phenoData = pheno,
              featureData = featuredata[known, ],
              experimentData = new("MIAME",
                                   abstract = mabstract,
                                   title = mtitle,
                                   pubMedIds = mpubmedids,
                                   other = meta))
  return(eset)
}
44
+
data/R/MARQ.R CHANGED
@@ -1,3 +1,79 @@
1
1
  library('yaml');
2
2
 
3
3
  MARQ.config = yaml.load_file('~/.MARQ');
4
+
5
+
6
+ ####################################################
7
+ # GEO platforms and datasets helper functions
8
+
9
# Locate the file prefix of a processed GEO dataset.
#
# dataset:        dataset name; '_cross_platform' is appended when
#                 cross_platform is TRUE and the name does not contain it.
# cross_platform: look for the cross-platform version of the dataset.
# datadir:        GEO data directory; defaults to `<MARQ.config$datadir>/GEO`
#                 when a MARQ.config object exists.
#
# Looks for `<datadir>/*/*/<dataset>.orders` and returns the first match with
# the '.orders' extension removed, or NULL when nothing matches. Stops with
# an error when no data directory can be determined.
MARQ.GEO.path <- function(dataset, cross_platform = FALSE, datadir = NULL) {
  if (is.null(datadir) && exists('MARQ.config')) {
    datadir <- paste(MARQ.config$datadir, 'GEO', sep = "/")
  }

  if (is.null(datadir)) {
    # exit() is not an R function; raise a proper error with the message
    stop("No datadir specified and no default found (MARQ.config$datadir)", call. = FALSE)
  }

  if (cross_platform && length(grep('_cross_platform', dataset)) == 0) {
    dataset <- paste(dataset, '_cross_platform', sep = "")
  }

  files <- Sys.glob(paste(datadir, '*', '*', paste(dataset, 'orders', sep = "."), sep = "/"))

  if (length(files) == 0) {
    return(NULL)
  }

  # Strip only the literal trailing extension. An unescaped, unanchored
  # '.orders' would also match earlier text such as a 'borders' directory.
  sub('\\.orders$', '', files[1])
}
34
+
35
# Find the platform of a dataset from the path where it is stored.
#
# dataset: dataset name, resolved with MARQ.GEO.path.
# datadir: optional GEO data directory passed to MARQ.GEO.path.
#
# Returns the 'GPL<digits>' identifier taken from the dataset path, or NULL
# when the dataset path cannot be found.
MARQ.GEO.platform <- function(dataset, datadir = NULL) {
  dataset.path <- MARQ.GEO.path(dataset, datadir = datadir)

  if (!is.null(dataset.path)) {
    sub(".*(GPL\\d+).*", "\\1", dataset.path, perl = TRUE)
  } else {
    NULL
  }
}
43
+
44
# Build the directory path of a platform inside the GEO data directory.
#
# platform: platform identifier, used as the last path component.
# datadir:  GEO data directory; defaults to `<MARQ.config$datadir>/GEO` when
#           a MARQ.config object exists.
#
# Returns `<datadir>/<platform>`. Stops with an error when no data directory
# can be determined.
MARQ.GEO.platform.path <- function(platform, datadir = NULL) {
  if (is.null(datadir) && exists('MARQ.config')) {
    datadir <- paste(MARQ.config$datadir, 'GEO', sep = "/")
  }

  if (is.null(datadir)) {
    # exit() is not an R function; raise a proper error with the message
    stop("No datadir specified and no default found (MARQ.config$datadir)", call. = FALSE)
  }

  paste(datadir, platform, sep = "/")
}
56
+
57
# List the datasets that have an '.orders' file under a platform directory.
#
# platform:       platform identifier, resolved with MARQ.GEO.platform.path.
# cross_platform: when TRUE only '*_cross_platform.orders' files are listed;
#                 when FALSE every '*.orders' file matches.
# series:         when TRUE every subdirectory of the platform is searched;
#                 when FALSE only the 'GDS' subdirectory.
# datadir:        optional GEO data directory.
#
# Returns a character vector with the GDS/GSE identifier extracted from each
# matching file; character(0) when nothing matches.
MARQ.GEO.platform.datasets <- function(platform, cross_platform = TRUE, series = TRUE, datadir = NULL) {
  cp.suffix <- if (cross_platform) '_cross_platform' else ''
  pattern <- if (series) '*' else 'GDS'

  files <- Sys.glob(paste(MARQ.GEO.platform.path(platform, datadir),
                          pattern,
                          paste('*', cp.suffix, '.orders', sep = ""),
                          sep = "/"))

  # sub() is vectorised, so the result is always a character vector;
  # sapply() returned an empty list when no file matched.
  sub(".*((?:GDS|GSE)\\d+).*", '\\1', files, perl = TRUE)
}
75
+
76
# Load the stored results of a GEO dataset.
#
# dataset, cross_platform: used to resolve the file prefix with MARQ.GEO.path.
# orders, logratios, t, p.values: flags passed on to MA.load, in that order.
#
# Returns whatever MA.load returns for the resolved prefix.
MARQ.GEO.load <- function(dataset, cross_platform = FALSE, orders = TRUE, logratios = TRUE, t = TRUE, p.values = TRUE) {
  prefix <- MARQ.GEO.path(dataset, cross_platform)
  MA.load(prefix, orders, logratios, t, p.values)
}
79
+
@@ -5,71 +5,144 @@ require 'MARQ/ID'
5
5
  require 'yaml'
6
6
  require 'progress-monitor'
7
7
  require 'MARQ/MADB'
8
+ require 'rbbt/sources/organism'
8
9
 
9
10
  $platform ||= ENV['platform']
10
- $org ||= [$organism, ENV['organism'], nil].reject{|e| e.nil?}.first
11
- $series ||= ENV['series']
11
+ $organism ||= [$organism, ENV['organism'], nil].compact.first
12
+ $dataset ||= ENV['dataset']
12
13
 
13
14
  $expr_threshold ||= (ENV['threshold'] || 0.05).to_f
14
15
  $folds ||= (ENV['folds'] || 2.5).to_f
15
16
  $nth_genes ||= (ENV['nth_genes'] || 100).to_i
16
17
 
17
- $force = [$force, ENV['force'], false]. reject{|e| e.nil?}.first
18
- $update_db = [$update_db, ENV['update_db'], true]. reject{|e| e.nil?}.first
19
- $fdr = [$fdr, ENV['fdr'], true]. reject{|e| e.nil?}.first
20
- $do_folds = [$do_folds, ENV['do_folds'], true]. reject{|e| e.nil?}.first
18
+ $force = [$force, ENV['force'], false].compact.first.to_s == 'true'
19
+ $tranlations = [$tranlations, ENV['translations'], false].compact.first.to_s == 'true'
20
+ $series = [$series, ENV['series'], true].compact.first.to_s == 'true'
21
+ $update_db = [$update_db, ENV['update_db'], false].compact.first.to_s == 'true'
22
+ $skip_db = [$skip_db, ENV['skip_db'], false].compact.first.to_s == 'true'
23
+ $fdr = [$fdr, ENV['fdr'], true].compact.first.to_s == 'true'
24
+ $do_folds = [$do_folds, ENV['do_folds'], true].compact.first.to_s == 'true'
21
25
 
22
26
 
23
27
  # Record changes in order to update DB
24
28
  $changes = false
25
- module GEO
29
+ module GEO::Process::R
26
30
  class << self
27
- alias_method :get_GDS_old, :get_GDS
28
- def get_GDS(*args)
31
+ alias_method :GDS_old, :GDS
32
+ def GDS(*args)
29
33
  $changes = true
30
- get_GDS_old(*args)
34
+ GDS_old(*args)
31
35
  end
32
36
 
33
- alias_method :get_GSE_old, :get_GSE
34
- def get_GSE(*args)
37
+ alias_method :GSE_old, :GSE
38
+ def GSE(*args)
35
39
  $changes = true
36
- get_GSE_old(*args)
40
+ GSE_old(*args)
37
41
  end
38
42
  end
39
43
  end
40
44
 
41
45
 
42
- def process_platform(platform, update_db = false)
43
- begin
44
- $changes = false
45
- GEO.process_platform(platform)
46
- GEO.process_platform_datasets(platform, $force && !update_db)
47
- if update_db && ($changes || $force)
48
- puts "Saving #{platform}"
49
- MADB::GEO.saveGPL(platform) if update_db && ($changes || $force)
50
- end
51
- rescue
52
- puts $!.message
46
# Build the work list for the 'data' task: a hash mapping each platform to
# the datasets and series that have to be processed for it.
#
# Selection, in order of precedence:
# * $dataset:  only that dataset, under its platform.
# * $platform: all the datasets of that platform.
# * otherwise: every platform of $organism (or of all organisms) that has
#   datasets, plus the series described in series/*.yaml whose platform
#   belongs to one of those organisms.
#
# When $tranlations or $update_db is set, platforms are listed with an empty
# dataset list.
def process_list
  return {GEO::Remote::dataset_platform($dataset) => [$dataset]} if $dataset
  return {$platform => GEO::Remote::platform_datasets($platform)} if $platform

  organisms = $organism ? [$organism] : Organism.all(true)

  list = {}
  organisms.each do |organism|
    GEO::Remote::organism_platforms(organism).each do |platform|
      datasets = GEO::Remote::platform_datasets(platform)

      # Platforms with no datasets are skipped, although if they have series
      # they may be considered later
      next if datasets.empty?

      if $tranlations || $update_db
        list[platform] = []
      else
        unless $force
          # Drop datasets for which files are already found.
          # NOTE(review): this globs '<dataset_path>/.*'; confirm that
          # GEO.dataset_path returns a directory and not a file prefix.
          datasets.reject! { |dataset| Dir.glob(File.join(GEO.dataset_path(dataset, platform) || "MISSING", '.*')).any? }
        end
        list[platform] = datasets
      end
    end
  end

  # NOTE(review): every other use of this flag is spelled $tranlations;
  # $translations is not assigned anywhere in the visible code, so this
  # condition presumably reduces to `$series`. Left unchanged; confirm the
  # intended behaviour before renaming.
  if $series && ! $translations
    # File.basename strips only a literal trailing '.yaml'
    series = Dir.glob('series/*.yaml').collect { |f| File.basename(f, '.yaml') }
    series.each do |serie|
      platform = nil
      begin
        # The series lookup is inside the rescue block so that one broken
        # series is reported and skipped instead of aborting the whole list
        platform = GEO::SOFT.GSE(serie)[:platform]
        if organisms.include? GEO::SOFT::GPL(platform)[:organism].to_s
          list[platform] ||= []
          list[platform] << serie unless $tranlations || $update_db
        end
      rescue
        puts "Error process series #{serie} platform #{platform}"
        puts $!.message
        puts $!.backtrace.join("\n")
      end
    end
  end

  return list
end
55
95
 
56
- def process_serie(serie, update_db = false)
57
- begin
96
+ desc "Analyze datasets"
97
+ task 'data' do
98
+
99
+ platforms_to_save = []
100
+
101
+ process_list.each{|platform, datasets|
102
+
103
+ begin
104
+ # Prepare the platform
105
+ GEO::Process.GPL(platform)
106
+ rescue
107
+ puts "Error processing platform #{platform}"
108
+ puts $!.message
109
+ puts $!.backtrace.join("\n")
110
+ next
111
+ end
112
+
58
113
  $changes = false
59
- info = YAML::load(File.open("series/#{ serie }.yaml"))
114
+ # Process all datasets
115
+ datasets.each{|dataset|
116
+ begin
117
+ if dataset =~ /GDS/
118
+ GEO::Process.GDS(dataset, platform)
119
+ else
120
+ info = YAML::load(File.open("series/#{ dataset }.yaml"))
121
+ FileUtils.rm("platforms/#{ info[:platform] }.skip") if File.exist? "platforms/#{ info[:platform] }.skip"
122
+ GEO::Process.GSE(dataset, info)
123
+ end
124
+
125
+ # Mark the platform for saving in DB
126
+ rescue
127
+ puts "Error processing dataset #{ dataset }"
128
+ puts $!.message
129
+ puts $!.backtrace.join("\n")
130
+ end
131
+ }
60
132
 
61
- FileUtils.rm("platforms/#{ info[:platform] }.skip") if File.exist? "platforms/#{ info[:platform] }.skip"
62
- GEO.process_platform(info[:platform]) if !File.exist? info[:platform]
133
+ platforms_to_save << platform if $changes || $update_db
134
+ }
63
135
 
64
- GEO.process_GSE(serie, info)
65
- if info[:platform] && update_db
66
- puts "Saving #{info[:platform]}"
67
- MADB::GEO.saveGPL(info[:platform]) if info[:platform] && update_db
136
+ platforms_to_save.each{|platform|
137
+ begin
138
+ puts "Saving #{platform}"
139
+ MADB::GEO.saveGPL(platform)
140
+ rescue
141
+ puts "Error saving platform #{ platform }"
142
+ puts $!.message
143
+ puts $!.backtrace.join("\n")
68
144
  end
69
- info[:platform]
70
- rescue
71
- puts $!.message
72
- end
145
+ }
73
146
  end
74
147
 
75
148
  def annotations(name, cross_platform = false, &block)
@@ -98,7 +171,7 @@ def annotations(name, cross_platform = false, &block)
98
171
  end
99
172
  }
100
173
  return if $platform
101
-
174
+
102
175
  Progress.monitor("Annotating with #{ name }, series")
103
176
  series = Dir.glob('series/*.yaml').collect{|f| File.basename(f).sub(/.yaml/,'')}
104
177
  series.each{|serie|
@@ -125,42 +198,6 @@ def goterms(org, list, slim, threshold)
125
198
  end
126
199
 
127
200
 
128
- task 'data' do
129
- if $platform
130
- process_platform($platform, $update_db)
131
- elsif $series
132
- process_serie($series, $update_db)
133
- elsif $org
134
- org = $org
135
- platforms = GEO::Eutils.organism_platforms(org).select{|platform| GEO::Eutils.GPL_datasets(platform).any?}
136
- Progress.monitor("Platforms for #{Organism.name(org)}")
137
- platforms.each{|platform|
138
- puts "Platform #{ platform }"
139
- process_platform(platform,$update_db)
140
- }
141
- else
142
- # Series
143
- series = Dir.glob('series/*.yaml').collect{|f| File.basename(f).sub(/.yaml/,'')}
144
- platform2save = []
145
- series.each{|serie|
146
- platform = process_serie(serie, false)
147
- platform2save << platform if $update_db && ($changes || $force)
148
- }
149
-
150
- Progress.monitor("Saving #{platform2save.uniq.length} platforms: ")
151
- platform2save.uniq.each{|platform| MADB::GEO.saveGPL(platform) }
152
-
153
- # Platforms
154
- for org in Organism.all
155
- platforms = GEO::Eutils.organism_platforms(org).select{|platform| GEO::Eutils.GPL_datasets(platform).any?}
156
- Progress.monitor("Platforms for #{Organism.name(org)}")
157
- platforms.each{|platform|
158
- puts "Platform #{ platform }"
159
- process_platform(platform,$update_db)
160
- }
161
- end
162
- end
163
- end
164
201
 
165
202
  task 'annotate_Words' do
166
203
  require 'MARQ/annotations'