rbbt-GE 0.2.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/rbbt/GE/GEO.rb CHANGED
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  require 'rbbt-util'
2
3
  require 'rbbt/GE'
3
4
  require 'rbbt/sources/organism'
@@ -9,7 +10,7 @@ module GEO
9
10
  self.pkgdir = "geo"
10
11
  self.subdir = "arrays"
11
12
 
12
- GEO.claim GEO.root.find(:user), :rake, Rbbt.share.install.GEO.Rakefile.find(:lib)
13
+ GEO.claim GEO.root, :rake, Rbbt.share.install.GEO.Rakefile.find(:lib)
13
14
 
14
15
  def self.comparison_name(field, condition, control)
15
16
  condition = condition * " AND " if Array === condition
@@ -74,11 +75,12 @@ module GEO
74
75
  }
75
76
 
76
77
  GDS_INFO = {
77
- :DELIMITER => "\\^SUBSET|!sample_table_begin",
78
- :title => "!Sample_title",
79
- :accession => "!Sample_geo_accession",
80
- :channel_count => "!Sample_channel_count",
81
- :platform => "!Sample_platform_id",
78
+ :DELIMITER => "\\^SUBSET",
79
+ :value_type => "!dataset_value_type",
80
+ :channel_count => "!dataset_channel_count",
81
+ :platform => "!dataset_platform",
82
+ :reference_series => "!dataset_reference_series",
83
+ :description => "!dataset_description"
82
84
  }
83
85
 
84
86
  GDS_SUBSET_INFO = {
@@ -149,7 +151,7 @@ module GEO
149
151
  end
150
152
 
151
153
  def self.guess_id(organism, codes)
152
- num_codes = codes.size
154
+ num_codes = codes.length
153
155
  best = nil
154
156
  best_count = 0
155
157
  new_fields = []
@@ -160,9 +162,15 @@ module GEO
160
162
  values = CMD.cmd("cat #{ codefile }|cut -f #{ i + 1 }| tr '|' '\\n'|grep [[:alpha:]]|sort -u").read.split("\n").reject{|code| code.empty?}
161
163
 
162
164
  new_field, count = Organism.guess_id(organism, values)
165
+ new_field ||= field
166
+ count ||= 0
167
+ new_field = "UNKNOWN(#{new_field})" unless count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.2 and count > values.uniq.length * 0.5
168
+
169
+ Log.debug "Original field: #{ field }. New: #{new_field}. Count: #{ count }/#{num_codes}/#{values.uniq.length}"
170
+ new_fields << new_field
171
+
163
172
  field_counts[new_field] = count
164
- Log.debug "Original field: #{ field }. New: #{new_field}. Count: #{ count }/#{num_codes}"
165
- new_fields << (count > (num_codes > 20000 ? 20000 : num_codes).to_f * 0.5 ? new_field : "UNKNOWN(#{ field })")
173
+
166
174
  if count > best_count
167
175
  best = new_field
168
176
  best_count = count
@@ -184,7 +192,8 @@ module GEO
184
192
  code_file = File.join(directory, 'codes')
185
193
  info_file = File.join(directory, 'info.yaml')
186
194
 
187
- stream = Open.open(GPL_URL.gsub('#PLATFORM#', platform), :nocache => true, :pipe => true)
195
+ # Fix platforms with the '.\d' extension (eg. NM_020527.1)
196
+ stream = CMD.cmd('sed \'s/\.[[:digit:]]\+\(\t\|$\)/\1/g;s/ *\/\/[^\t]*//g\'', :in => Open.open(GPL_URL.gsub('#PLATFORM#', platform), :nocache => true), :pipe => true)
188
197
 
189
198
  info = parse_header(stream, GPL_INFO)
190
199
  info[:code_file] = code_file
@@ -278,7 +287,9 @@ module GEO
278
287
 
279
288
  samples << sample
280
289
 
281
- sample_values = TSV.open(StringIO.new(chunk.match(/!sample_table_begin(.*)!sample_table_end/msi)[0].strip), :type => :list, :header_hash => '')
290
+ chunk = chunk.encode "UTF-8"
291
+ chunk = Misc.fixutf8 chunk
292
+ sample_values = TSV.open(StringIO.new(chunk.match(/!sample_table_begin\n(.*)\n!sample_table_end/mi)[1].strip), :type => :list, :header_hash => '')
282
293
  sample_values.fields = [sample]
283
294
 
284
295
  if values.nil?
@@ -310,12 +321,14 @@ module GEO
310
321
  key_field = TSV.parse_header(GEO[info[:platform]]['codes'].open).key_field
311
322
  values.key_field = key_field
312
323
 
313
- Open.write(value_file, values.to_s)
314
- Open.write(info_file, info.to_yaml)
315
-
324
+ info[:sample_info] ||= sample_info
316
325
  info[:channel_count] ||= sample_info.values.first[:channel_count]
317
326
  info[:value_type] ||= sample_info.values.first[:value_type]
318
327
 
328
+
329
+ Open.write(value_file, values.to_s)
330
+ Open.write(info_file, info.to_yaml)
331
+
319
332
  info
320
333
  end
321
334
  end
@@ -346,3 +359,4 @@ module GEO
346
359
  GE.analyze(value_file, condition_samples, control_samples, log2, path, format)
347
360
  end
348
361
  end
362
+
data/lib/rbbt/GE.rb CHANGED
@@ -9,15 +9,18 @@ module GE
9
9
  R.run(cmd)
10
10
  end
11
11
 
12
- def self.r_format(list)
12
+ def self.r_format(list, options = {})
13
+ strings = options[:strings]
13
14
  case
14
15
  when list.nil?
15
16
  "NULL"
16
17
  when Array === list
17
- "c(#{list.collect{|e| r_format e} * ", "})"
18
- when (String === list and list === list.to_i.to_s)
18
+ "c(#{list.collect{|e| r_format e, options} * ", "})"
19
+ when (Fixnum === list or Float === list)
20
+ list.to_s
21
+ when (not strings and String === list and list === list.to_i.to_s)
19
22
  list.to_i
20
- when (String === list and list === list.to_f.to_s)
23
+ when (not strings and String === list and list === list.to_f.to_s)
21
24
  list.to_f
22
25
  when TrueClass === list
23
26
  "TRUE"
@@ -28,9 +31,9 @@ module GE
28
31
  end
29
32
  end
30
33
 
31
- def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil)
34
+ def self.analyze(datafile, main, contrast = nil, log2 = false, outfile = nil, key_field = nil, two_channel = nil)
32
35
  FileUtils.mkdir_p File.dirname(outfile) unless outfile.nil? or File.exists? File.dirname(outfile)
33
- GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main)}, contrast = #{r_format(contrast)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field})")
36
+ GE.run_R("rbbt.GE.process(#{ r_format datafile }, main = #{r_format(main, :strings => true)}, contrast = #{r_format(contrast, :strings => true)}, log2=#{ r_format log2 }, outfile = #{r_format outfile}, key.field = #{r_format key_field}, two.channel = #{r_format two_channel})")
34
37
  end
35
38
  end
36
39
 
@@ -6,6 +6,26 @@ rule /^(GPL\d+)\/?(codes|info\.yaml)?$/ do |t|
6
6
  GEO::SOFT.GPL(platform, file.nil? ? t.name : File.dirname(t.name))
7
7
  end
8
8
 
9
+ rule /^(GDS\d+)\/samples$/ => [proc{|t| t.sub('samples', 'info.yaml')}, proc{|t| t.sub('samples', 'values')} ] do |t|
10
+ info = YAML.load(Open.open(t.prerequisites.first))
11
+
12
+ subsets = info[:subsets]
13
+ fields = subsets.keys
14
+
15
+ all_samples = TSV::Parser.new(Open.open(t.prerequisites.last)).fields
16
+ samples = TSV.setup(all_samples, :key_field => "Sample", :fields => [], :type => :list)
17
+
18
+ fields.each do |field|
19
+ assignments = subsets[field]
20
+
21
+ samples = samples.add_field field do |sample, v|
22
+ assignments.select{|value, list| list.include? sample}.first.first
23
+ end
24
+ end
25
+
26
+ Open.write(t.name, samples.to_s)
27
+ end
28
+
9
29
  rule /^(GDS\d+)\/?(values|info\.yaml)?$/ do |t|
10
30
  t.name =~ /^(GDS\d+)\/?(values|info\.yaml)?/
11
31
  dataset = $1
data/share/lib/R/MA.R CHANGED
@@ -55,13 +55,31 @@ rbbt.GE.process.limma.twoside <- function(expr, subset.main, subset.contrast){
55
55
  }
56
56
 
57
57
 
58
+ rbbt.GE.guess.log2 <- function(m, two.channel){
59
+ if (two.channel){
60
+ return (sum(m < 0, na.rm = TRUE) == 0);
61
+ }else{
62
+ return (max(m, na.rm = TRUE) > 100);
63
+ }
64
+ }
65
+
66
+
58
67
 
59
- rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL){
68
+ rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile = NULL, key.field = NULL, two.channel = NULL){
60
69
  data = rbbt.tsv(file);
61
70
  ids = rownames(data);
62
71
 
72
+ if (is.null(log2)){
73
+ print(str(data));
74
+ log2 = rbbt.GE.guess.log2(data, two.channel)
75
+ }
76
+
63
77
  if (log2){
64
78
  data = log2(data);
79
+ min = min(data[data != -Inf])
80
+ data[data == -Inf] = min
81
+ print(summary(data));
82
+ return
65
83
  }
66
84
 
67
85
  if (is.null(contrast)){
@@ -103,6 +121,7 @@ rbbt.GE.process <- function(file, main, contrast = NULL, log2 = FALSE, outfile =
103
121
 
104
122
  if (! is.null(limma)){
105
123
  result = data.frame(ratio = ratio[ids], t.values = limma$t[ids], p.values = limma$p.values[ids])
124
+ result["adjusted.p.values"] = p.adjust(result$p.values, "fdr")
106
125
  }else{
107
126
  result = data.frame(ratio = ratio)
108
127
  }
metadata CHANGED
@@ -1,45 +1,39 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: rbbt-GE
3
- version: !ruby/object:Gem::Version
4
- hash: 23
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 2
9
- - 0
10
- version: 0.2.0
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Miguel Vazquez
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-01-31 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
12
+ date: 2012-12-21 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
21
15
  name: rbbt-util
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
24
17
  none: false
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- hash: 3
29
- segments:
30
- - 0
31
- version: "0"
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
32
22
  type: :runtime
33
- version_requirements: *id001
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
34
30
  description: Gene Expression in RBBT
35
31
  email: miguel.vazquez@cnio.es
36
32
  executables: []
37
-
38
33
  extensions: []
39
-
40
- extra_rdoc_files:
34
+ extra_rdoc_files:
41
35
  - LICENSE
42
- files:
36
+ files:
43
37
  - LICENSE
44
38
  - lib/rbbt/GE.rb
45
39
  - lib/rbbt/GE/GEO.rb
@@ -49,37 +43,28 @@ files:
49
43
  - test/rbbt/GE/test_GEO.rb
50
44
  homepage: http://github.com/mikisvaz/rbbt-GE
51
45
  licenses: []
52
-
53
46
  post_install_message:
54
47
  rdoc_options: []
55
-
56
- require_paths:
48
+ require_paths:
57
49
  - lib
58
- required_ruby_version: !ruby/object:Gem::Requirement
50
+ required_ruby_version: !ruby/object:Gem::Requirement
59
51
  none: false
60
- requirements:
61
- - - ">="
62
- - !ruby/object:Gem::Version
63
- hash: 3
64
- segments:
65
- - 0
66
- version: "0"
67
- required_rubygems_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
68
57
  none: false
69
- requirements:
70
- - - ">="
71
- - !ruby/object:Gem::Version
72
- hash: 3
73
- segments:
74
- - 0
75
- version: "0"
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
76
62
  requirements: []
77
-
78
63
  rubyforge_project:
79
- rubygems_version: 1.8.10
64
+ rubygems_version: 1.8.24
80
65
  signing_key:
81
66
  specification_version: 3
82
67
  summary: Gene Expression in RBBT
83
- test_files:
68
+ test_files:
84
69
  - test/test_helper.rb
85
70
  - test/rbbt/GE/test_GEO.rb