rbbt-GE 0.2.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/GE/GEO.rb +28 -14
- data/lib/rbbt/GE.rb +9 -6
- data/share/install/GEO/Rakefile +20 -0
- data/share/lib/R/MA.R +20 -1
- metadata +34 -49
data/lib/rbbt/GE/GEO.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
require 'rbbt-util'
|
2
3
|
require 'rbbt/GE'
|
3
4
|
require 'rbbt/sources/organism'
|
@@ -9,7 +10,7 @@ module GEO
|
|
9
10
|
self.pkgdir = "geo"
|
10
11
|
self.subdir = "arrays"
|
11
12
|
|
12
|
-
GEO.claim GEO.root
|
13
|
+
GEO.claim GEO.root, :rake, Rbbt.share.install.GEO.Rakefile.find(:lib)
|
13
14
|
|
14
15
|
def self.comparison_name(field, condition, control)
|
15
16
|
condition = condition * " AND " if Array === condition
|
@@ -74,11 +75,12 @@ module GEO
|
|
74
75
|
}
|
75
76
|
|
76
77
|
GDS_INFO = {
|
77
|
-
:DELIMITER => "\\^SUBSET
|
78
|
-
:
|
79
|
-
:
|
80
|
-
:
|
81
|
-
:
|
78
|
+
:DELIMITER => "\\^SUBSET",
|
79
|
+
:value_type => "!dataset_value_type",
|
80
|
+
:channel_count => "!dataset_channel_count",
|
81
|
+
:platform => "!dataset_platform",
|
82
|
+
:reference_series => "!dataset_reference_series",
|
83
|
+
:description => "!dataset_description"
|
82
84
|
}
|
83
85
|
|
84
86
|
GDS_SUBSET_INFO = {
|
@@ -149,7 +151,7 @@ module GEO
|
|
149
151
|
end
|
150
152
|
|
151
153
|
def self.guess_id(organism, codes)
|
152
|
-
num_codes = codes.
|
154
|
+
num_codes = codes.length
|
153
155
|
best = nil
|
154
156
|
best_count = 0
|
155
157
|
new_fields = []
|
@@ -160,9 +162,15 @@ module GEO
|
|
160
162
|
values = CMD.cmd("cat #{ codefile }|cut -f #{ i + 1 }| tr '|' '\\n'|grep [[:alpha:]]|sort -u").read.split("\n").reject{|code| code.empty?}
|
161
163
|
|
162
164
|
new_field, count = Organism.guess_id(organism, values)
|
165
|
+
new_field ||= field
|
166
|
+
count ||= 0
|
167
|
+
new_field = "UNKNOWN(#{new_field})" unless count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.2 and count > values.uniq.length * 0.5
|
168
|
+
|
169
|
+
Log.debug "Original field: #{ field }. New: #{new_field}. Count: #{ count }/#{num_codes}/#{values.uniq.length}"
|
170
|
+
new_fields << new_field
|
171
|
+
|
163
172
|
field_counts[new_field] = count
|
164
|
-
|
165
|
-
new_fields << (count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.5 ? new_field : "UNKNOWN(#{ field })")
|
173
|
+
|
166
174
|
if count > best_count
|
167
175
|
best = new_field
|
168
176
|
best_count = count
|
@@ -184,7 +192,8 @@ module GEO
|
|
184
192
|
code_file = File.join(directory, 'codes')
|
185
193
|
info_file = File.join(directory, 'info.yaml')
|
186
194
|
|
187
|
-
|
195
|
+
# Fix platforms with the '.\d' extension (eg. NM_020527.1)
|
196
|
+
stream = CMD.cmd('sed \'s/\.[[:digit:]]\+\(\t\|$\)/\1/g;s/ *\/\/[^\t]*//g\'', :in => Open.open(GPL_URL.gsub('#PLATFORM#', platform), :nocache => true), :pipe => true)
|
188
197
|
|
189
198
|
info = parse_header(stream, GPL_INFO)
|
190
199
|
info[:code_file] = code_file
|
@@ -278,7 +287,9 @@ module GEO
|
|
278
287
|
|
279
288
|
samples << sample
|
280
289
|
|
281
|
-
|
290
|
+
chunk = chunk.encode "UTF-8"
|
291
|
+
chunk = Misc.fixutf8 chunk
|
292
|
+
sample_values = TSV.open(StringIO.new(chunk.match(/!sample_table_begin\n(.*)\n!sample_table_end/mi)[1].strip), :type => :list, :header_hash => '')
|
282
293
|
sample_values.fields = [sample]
|
283
294
|
|
284
295
|
if values.nil?
|
@@ -310,12 +321,14 @@ module GEO
|
|
310
321
|
key_field = TSV.parse_header(GEO[info[:platform]]['codes'].open).key_field
|
311
322
|
values.key_field = key_field
|
312
323
|
|
313
|
-
|
314
|
-
Open.write(info_file, info.to_yaml)
|
315
|
-
|
324
|
+
info[:sample_info] ||= sample_info
|
316
325
|
info[:channel_count] ||= sample_info.values.first[:channel_count]
|
317
326
|
info[:value_type] ||= sample_info.values.first[:value_type]
|
318
327
|
|
328
|
+
|
329
|
+
Open.write(value_file, values.to_s)
|
330
|
+
Open.write(info_file, info.to_yaml)
|
331
|
+
|
319
332
|
info
|
320
333
|
end
|
321
334
|
end
|
@@ -346,3 +359,4 @@ module GEO
|
|
346
359
|
GE.analyze(value_file, condition_samples, control_samples, log2, path, format)
|
347
360
|
end
|
348
361
|
end
|
362
|
+
|
data/lib/rbbt/GE.rb
CHANGED
@@ -9,15 +9,18 @@ module GE
|
|
9
9
|
R.run(cmd)
|
10
10
|
end
|
11
11
|
|
12
|
-
def self.r_format(list)
|
12
|
+
def self.r_format(list, options = {})
|
13
|
+
strings = options[:strings]
|
13
14
|
case
|
14
15
|
when list.nil?
|
15
16
|
"NULL"
|
16
17
|
when Array === list
|
17
|
-
"c(#{list.collect{|e| r_format e} * ", "})"
|
18
|
-
when (
|
18
|
+
"c(#{list.collect{|e| r_format e, options} * ", "})"
|
19
|
+
when (Fixnum === list or Float === list)
|
20
|
+
list.to_s
|
21
|
+
when (not strings and String === list and list === list.to_i.to_s)
|
19
22
|
list.to_i
|
20
|
-
when (String === list and list === list.to_f.to_s)
|
23
|
+
when (not strings and String === list and list === list.to_f.to_s)
|
21
24
|
list.to_f
|
22
25
|
when TrueClass === list
|
23
26
|
"TRUE"
|
@@ -28,9 +31,9 @@ module GE
|
|
28
31
|
end
|
29
32
|
end
|
30
33
|
|
31
|
-
def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil)
|
34
|
+
def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil, two_channel = nil)
|
32
35
|
FileUtils.mkdir_p File.dirname(outfile) unless outfile.nil? or File.exists? File.dirname(outfile)
|
33
|
-
GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main)}, contrast = #{r_format(contrast)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field})")
|
36
|
+
GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main, :strings => true)}, contrast = #{r_format(contrast, :strings => true)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field}, two.channel = #{r_format two_channel})")
|
34
37
|
end
|
35
38
|
end
|
36
39
|
|
data/share/install/GEO/Rakefile
CHANGED
@@ -6,6 +6,26 @@ rule /^(GPL\d+)\/?(codes|info\.yaml)?$/ do |t|
|
|
6
6
|
GEO::SOFT.GPL(platform, file.nil? ? t.name : File.dirname(t.name))
|
7
7
|
end
|
8
8
|
|
9
|
+
rule /^(GDS\d+)\/samples$/ => [proc{|t| t.sub('samples', 'info.yaml')}, proc{|t| t.sub('samples', 'values')} ] do |t|
|
10
|
+
info = YAML.load(Open.open(t.prerequisites.first))
|
11
|
+
|
12
|
+
subsets = info[:subsets]
|
13
|
+
fields = subsets.keys
|
14
|
+
|
15
|
+
all_samples = TSV::Parser.new(Open.open(t.prerequisites.last)).fields
|
16
|
+
samples = TSV.setup(all_samples, :key_field => "Sample", :fields => [], :type => :list)
|
17
|
+
|
18
|
+
fields.each do |field|
|
19
|
+
assignments = subsets[field]
|
20
|
+
|
21
|
+
samples = samples.add_field field do |sample, v|
|
22
|
+
assignments.select{|value, list| list.include? sample}.first.first
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
Open.write(t.name, samples.to_s)
|
27
|
+
end
|
28
|
+
|
9
29
|
rule /^(GDS\d+)\/?(values|info\.yaml)?$/ do |t|
|
10
30
|
t.name =~ /^(GDS\d+)\/?(values|info\.yaml)?/
|
11
31
|
dataset = $1
|
data/share/lib/R/MA.R
CHANGED
@@ -55,13 +55,31 @@ rbbt.GE.process.limma.twoside <- function(expr, subset.main, subset.contrast){
|
|
55
55
|
}
|
56
56
|
|
57
57
|
|
58
|
+
rbbt.GE.guess.log2 <- function(m, two.channel){
|
59
|
+
if (two.channel){
|
60
|
+
return (sum(m < 0, na.rm = TRUE) == 0);
|
61
|
+
}else{
|
62
|
+
return (max(m, na.rm = TRUE) > 100);
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
|
58
67
|
|
59
|
-
rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL){
|
68
|
+
rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL, two.channel = NULL){
|
60
69
|
data = rbbt.tsv(file);
|
61
70
|
ids = rownames(data);
|
62
71
|
|
72
|
+
if (is.null(log2)){
|
73
|
+
print(str(data));
|
74
|
+
log2 = rbbt.GE.guess.log2(data, two.channel)
|
75
|
+
}
|
76
|
+
|
63
77
|
if (log2){
|
64
78
|
data = log2(data);
|
79
|
+
min = min(data[data != -Inf])
|
80
|
+
data[data == -Inf] = min
|
81
|
+
print(summary(data));
|
82
|
+
return
|
65
83
|
}
|
66
84
|
|
67
85
|
if (is.null(contrast)){
|
@@ -103,6 +121,7 @@ rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile =
|
|
103
121
|
|
104
122
|
if (! is.null(limma)){
|
105
123
|
result = data.frame(ratio = ratio[ids], t.values = limma$t[ids], p.values = limma$p.values[ids])
|
124
|
+
result["adjusted.p.values"] = p.adjust(result$p.values, "fdr")
|
106
125
|
}else{
|
107
126
|
result = data.frame(ratio = ratio)
|
108
127
|
}
|
metadata
CHANGED
@@ -1,45 +1,39 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-GE
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 2
|
9
|
-
- 0
|
10
|
-
version: 0.2.0
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Miguel Vazquez
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-12-21 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
21
15
|
name: rbbt-util
|
22
|
-
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
24
17
|
none: false
|
25
|
-
requirements:
|
26
|
-
- -
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
|
29
|
-
segments:
|
30
|
-
- 0
|
31
|
-
version: "0"
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
32
22
|
type: :runtime
|
33
|
-
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
34
30
|
description: Gene Expression in RBBT
|
35
31
|
email: miguel.vazquez@cnio.es
|
36
32
|
executables: []
|
37
|
-
|
38
33
|
extensions: []
|
39
|
-
|
40
|
-
extra_rdoc_files:
|
34
|
+
extra_rdoc_files:
|
41
35
|
- LICENSE
|
42
|
-
files:
|
36
|
+
files:
|
43
37
|
- LICENSE
|
44
38
|
- lib/rbbt/GE.rb
|
45
39
|
- lib/rbbt/GE/GEO.rb
|
@@ -49,37 +43,28 @@ files:
|
|
49
43
|
- test/rbbt/GE/test_GEO.rb
|
50
44
|
homepage: http://github.com/mikisvaz/rbbt-GE
|
51
45
|
licenses: []
|
52
|
-
|
53
46
|
post_install_message:
|
54
47
|
rdoc_options: []
|
55
|
-
|
56
|
-
require_paths:
|
48
|
+
require_paths:
|
57
49
|
- lib
|
58
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
51
|
none: false
|
60
|
-
requirements:
|
61
|
-
- -
|
62
|
-
- !ruby/object:Gem::Version
|
63
|
-
|
64
|
-
|
65
|
-
- 0
|
66
|
-
version: "0"
|
67
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
68
57
|
none: false
|
69
|
-
requirements:
|
70
|
-
- -
|
71
|
-
- !ruby/object:Gem::Version
|
72
|
-
|
73
|
-
segments:
|
74
|
-
- 0
|
75
|
-
version: "0"
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
76
62
|
requirements: []
|
77
|
-
|
78
63
|
rubyforge_project:
|
79
|
-
rubygems_version: 1.8.
|
64
|
+
rubygems_version: 1.8.24
|
80
65
|
signing_key:
|
81
66
|
specification_version: 3
|
82
67
|
summary: Gene Expression in RBBT
|
83
|
-
test_files:
|
68
|
+
test_files:
|
84
69
|
- test/test_helper.rb
|
85
70
|
- test/rbbt/GE/test_GEO.rb
|