rbbt-sent 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Miguel Vazquez
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/R/matrix.R ADDED
@@ -0,0 +1,167 @@
1
+ library('som')
2
+
3
# Scale every row of `feats` so that its entries add up to 100 and,
# optionally, multiply each column by the absolute value of its weight.
#
# feats:        numeric matrix or data frame, one column per feature.
# feat.weights: optional weights, one per column of `feats`; only their
#               absolute values are used.
#
# Entries that come out as NA/NaN (for example in rows whose total is
# zero) are replaced by 0. Returns an object shaped like `feats`.
SENT.norm <- function(feats, feat.weights = NULL){
    n.cols <- dim(feats)[2]

    # Row totals, computed as a product with a column of ones
    totals <- as.matrix(feats) %*% matrix(1, nrow = n.cols, ncol = 1)

    # One scaling factor per row, repeated across every column
    factors <- matrix(100 / totals, nrow = length(totals), ncol = n.cols)
    normalized <- feats * factors

    normalized[is.na(normalized)] <- 0

    if (is.null(feat.weights)) {
        return(normalized)
    }

    # Each row of this matrix holds the full vector of absolute weights
    weights <- matrix(abs(feat.weights),
                      nrow = dim(normalized)[1],
                      ncol = length(feat.weights),
                      byrow = TRUE)
    normalized * weights
}
17
+
18
# Read a tab-separated feature matrix, drop the columns whose total is not
# positive, normalize the rows with SENT.norm and write the result.
#
# file.input:  path of the matrix (header line, row names in column 1).
# file.output: path where the normalized matrix is written.
# file.dict:   optional tab-separated file with one row per column of the
#              input matrix (name in column 1, weight in column 2), in the
#              same order as the matrix columns.
SENT.prepare.matrix <- function(file.input, file.output, file.dict = NULL){
    feats <- read.table(file.input, sep="\t", header=TRUE, row.names=1, check.names=FALSE)

    if (!is.null(file.dict)){
        feats.weights <- as.matrix(read.table(file=file.dict, sep="\t", row.names=1))
    } else {
        feats.weights <- NULL
    }

    good.words <- apply(feats, 2, sum) > 0

    if (!is.null(feats.weights)){
        if (nrow(feats.weights) != length(good.words)){
            stop("Dictionary has ", nrow(feats.weights), " entries but the matrix has ",
                 length(good.words), " columns", call. = FALSE)
        }
        # Keep the weights aligned with the columns that survive the filter;
        # otherwise SENT.norm would multiply objects of different widths.
        feats.weights <- feats.weights[good.words, , drop = FALSE]
    }

    # drop = FALSE keeps a data frame even when a single column remains
    feats <- feats[, good.words, drop = FALSE]
    feats <- SENT.norm(feats, feats.weights)

    write.table(file=file.output, feats, sep="\t", quote=FALSE)
}
33
+
34
# Combine the per-execution result files that share `prefix`.
#
# Files matching '<prefix>.matrix_w.*' are joined column-wise and the
# transposed result is written to '<prefix>.features' (without row names).
# Files matching '<prefix>.matrix_h.*' are joined row-wise and the
# transposed result is written to '<prefix>.profiles' (without a header).
SENT.join.results <- function(prefix){
    read.part <- function(path){
        read.table(path, sep="\t", header=TRUE, row.names=1, check.names=FALSE)
    }

    w.paths <- Sys.glob(paste(prefix, '.matrix_w.*', sep=""))
    h.paths <- Sys.glob(paste(prefix, '.matrix_h.*', sep=""))

    # Reduce() folds the tables pairwise, left to right
    joined.w <- Reduce(cbind, lapply(w.paths, read.part))
    write.table(file=paste(prefix, '.features', sep=''), t(joined.w),
                sep="\t", quote=FALSE, row.names=FALSE)

    joined.h <- Reduce(rbind, lapply(h.paths, read.part))
    write.table(file=paste(prefix, '.profiles', sep=''), t(joined.h),
                sep="\t", quote=FALSE, col.names=FALSE)
}
63
+
64
+
65
# Cluster the factors found in '<prefix>.features' / '<prefix>.profiles',
# merge the factors of each cluster and write the results under `output`.
#
# prefix:    common prefix of the '.profiles' and '.features' input files.
# output:    prefix for every file written by this function.
# clusters:  number of clusters to cut the tree into. NULL or an empty
#            string means "number of factors / 10"; a numeric string is
#            converted to a number.
# num.words: how many top-scoring words to save per group (a numeric
#            string is accepted).
#
# Files written: '<output>.cophenetic', '<output>.merged.profiles',
# '<output>.merged.features', '<output>.jpg', '<output>.hard.jpg' and, per
# group g, '<output>.<g>.genes' and '<output>.<g>.words'.
SENT.analyze <- function(prefix, output, clusters = NULL, num.words = 15){
    profiles <- read.table(paste(prefix, '.profiles', sep=""), sep="\t", row.names=1, check.names=FALSE)
    features <- read.table(paste(prefix, '.features', sep=""), sep="\t", header=TRUE, check.names=FALSE)

    # Arguments may arrive as quoted strings; '' stands for "not given"
    if (is.character(clusters)){
        clusters <- if (nzchar(clusters)) as.numeric(clusters) else NULL
    }
    num.words <- as.numeric(num.words)

    # Assume 10 repetitions
    if (is.null(clusters)){
        clusters <- dim(features)[1] / 10
    }

    # Form a clustering
    fdist <- dist(features)
    hfeatures <- hclust(fdist, method="ward")
    cfeatures <- cutree(hfeatures, k=clusters)

    coph <- cor(fdist, cophenetic(hfeatures))
    write(coph, file = paste(output, '.cophenetic', sep=""))

    # Average between clusters (10% trimmed mean)
    profiles.merged = vector()
    features.merged = vector()
    for (i in levels(factor(cfeatures))){
        profiles.merged = cbind(profiles.merged, apply(as.matrix(profiles[,cfeatures==i]),1,mean, trim=0.1))
        features.merged = rbind(features.merged, apply(as.matrix(features[cfeatures==i,]),2,mean, trim=0.1))
    }

    rownames(profiles.merged) <- rownames(profiles)
    colnames(features.merged) <- colnames(features)

    write.table(file=paste(output,'.merged.profiles',sep=''), profiles.merged, sep="\t", quote=FALSE, col.names=FALSE)
    write.table(file=paste(output,'.merged.features',sep=''), t(features.merged), sep="\t", quote=FALSE, col.names=FALSE)

    # Hard assign genes to features: each row keeps a single 1, placed on
    # the column holding its (first) maximum
    profiles.bin = profiles.merged
    for (i in seq_len(dim(profiles.bin)[1])){
        m = which.max(profiles.bin[i,])
        profiles.bin[i,] = 0
        profiles.bin[i,m] = 1
    }

    profiles.sorted = c()
    glabels = c()

    # Leaf labels in dendrogram order
    leaf.labels = unlist(dendrapply(as.dendrogram(hfeatures), function(e) attr(e, "label")))

    # '___' marks the position where the cluster changes along the dendrogram
    fgroups = cfeatures[leaf.labels]
    flabels = sapply(seq(1,dim(features)[1]-1), function(i){ if(fgroups[i] != fgroups[i+1]){ '___'}else{''}})
    flabels = c(flabels,'')

    flabels[leaf.labels] = flabels

    order = unique(fgroups)
    for (i in order){
        profiles.sorted = rbind(profiles[profiles.bin[,i]==1,], profiles.sorted)
        if (sum(profiles.bin[,i]==1) == 0) next
        glabels = c(rep('',sum(profiles.bin[,i]==1)-1),glabels)
        glabels = c('___',glabels)
    }

    # Produce heatmap image; the device is closed right away so the file is
    # complete before the next device is opened
    bitmap(file=paste(output,'.jpg',sep=""), type='jpeg', res=75)
    heatmap(as.matrix(profiles), Rowv=NA, Colv=as.dendrogram(hfeatures), xlab="Factors from 10 factorizations", ylab="Genes", labRow=glabels, labCol=flabels, margins=c(4,4))
    dev.off()

    # Produce heatmap image for hard assignment
    bitmap(file=paste(output,'.hard.jpg',sep=""), type='jpeg', res=75)
    heatmap(as.matrix(profiles.sorted), Rowv=NA, Colv=as.dendrogram(hfeatures), xlab="Factors from 10 factorizations", ylab="Genes", labRow=glabels, labCol=flabels, margins=c(4,4))
    dev.off()

    # Score of a word in a group: its value minus the mean of its values in
    # the other groups
    features.merged.scores = apply(features.merged, 2, function(x){
        x - (sum(x) - x) / (length(x) - 1)
    })

    # Save Group Genes and Words
    g = 1
    for (i in order){
        genes = rownames(profiles)[profiles.bin[,i] == 1]
        cat(file=paste(output,g,'genes',sep="."), genes, sep="\n")
        words = names(sort(features.merged.scores[i,], decreasing=TRUE))[1:num.words]
        cat(file=paste(output,g,'words',sep="."), words, sep="\n")
        g = g + 1
    }
}
166
+
167
+
data/README.rdoc ADDED
@@ -0,0 +1,23 @@
1
+ = rbbt-sent
2
+
3
+ Description goes here.
4
+
5
+ ===
6
+
7
+ Install R som package
8
+ Install ghostscript
9
+
10
+ == Note on Patches/Pull Requests
11
+
12
+ * Fork the project.
13
+ * Make your feature addition or bug fix.
14
+ * Add tests for it. This is important so I don't break it in a
15
+ future version unintentionally.
16
+ * Commit, do not mess with rakefile, version, or history.
17
+ (if you want to have your own version, that is fine but
18
+ bump version in a commit by itself I can ignore when I pull)
19
+ * Send me a pull request. Bonus points for topic branches.
20
+
21
+ == Copyright
22
+
23
+ Copyright (c) 2009 Miguel Vazquez. See LICENSE for details.
data/bin/sent_config ADDED
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'simpleconsole'
4
+
5
# Load the Sent library. Any StandardError raised while loading it is
# taken to mean that sent has not been configured yet; the flag is checked
# by the actions that need a working configuration.
begin
  require 'sent'
rescue StandardError
  $noconfig = true
end
10
+
11
+ $USAGE =<<EOT
12
+ #{__FILE__} <action> [<subaction>] [--force] [--organism <org>]
13
+ actions:
14
+ * configure: Set paths for data, work, and tmp directories
15
+
16
+ * install:
17
+ * analysis: Install configuration to perform analysis
18
+
19
+ * update:
20
+ * metadocs: Generate metadocs for all organisms
21
+
22
+ * init:
23
+ * webservice:
24
+ * www:
25
+
26
+
27
+
28
+ EOT
29
+
30
# Controller for the sent_config console application. Each public method
# is an action; it prepares instance variables and globals for the view.
class Controller < SimpleConsole::Controller

  params :bool => {:f => :force},
    :string => {:o => :organism, :h => :host, :p => :port}

  # Remember host and port and render the sub-action named by the id.
  def init
    @host = params[:host]
    @port = params[:port]
    render :action => params[:id]
  end

  def default
    render :action => :usage
  end

  def help
    render :action => :usage
  end

  # Select the install tasks to run. Without a sub-action the user is sent
  # to the help action.
  def install
    raise "Run #{__FILE__} configure first to configure sent" if $noconfig

    target = params[:id]
    if "analysis" == target
      @tasks = %w(analysis)
    elsif target.nil?
      redirect_to :action => :help, :id => :install
    else
      @tasks = [target]
    end

    $force = true if params[:force]
    $org = params[:organism] if params[:organism]
  end

  # Select the directory whose Rakefile performs the update. Only the
  # "metadocs" sub-action is known; anything else goes to the help action.
  def update
    raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig

    if "metadocs" == params[:id]
      @location = File.join(Sent.datadir,'analysis')
    else
      redirect_to :action => :help, :id => :update
    end

    $force = true if params[:force]
    $org = params[:organism] if params[:organism]
  end

  def configure
  end

end
87
+
88
# Views for the sent_config console application. Each public method does
# the actual work of the action with the same name.
class View < SimpleConsole::View
  # Print the usage text.
  def usage
    puts $USAGE
  end

  # Load the install tasks and invoke the ones selected by the controller.
  def install
    require 'rake'
    load File.join(Sent.rootdir, 'tasks/install.rake')

    (@tasks || []).each{|t|
      puts "Invoking #{ t }"
      Rake::Task[t].invoke
    }
  end

  # Run the 'update' task of the Rakefile found in @location.
  def update
    require 'rake'

    puts "Changing directory to #{@location}"
    # Explicit receiver, like the other actions, instead of relying on
    # FileUtils being mixed into this object
    FileUtils.cd @location

    load "./Rakefile"

    Rake::Task['update'].invoke
  end

  # Ask for the work, tmp and data directories and save the answers in
  # $HOME/.sent. An answer without word characters keeps the default.
  def configure
    defaultdir = File.join(ENV['HOME'],'sent')

    workdir = File.join(defaultdir, 'work')
    tmpdir  = File.join(defaultdir, 'tmp')
    datadir = File.join(defaultdir, 'data')

    puts "Please indicate where you wish to place the data directories"
    puts

    puts
    puts "* work Directory: This directory will hold downloads, from PubMed,
Entrez and other, for local store. It might grow considerably."
    workdir = ask_directory(workdir)

    puts
    puts "* Tmp Directory: Temporary files."
    tmpdir = ask_directory(tmpdir)

    puts
    puts "* Data Directory: Holds data from organisism, databases, third party software, etc."
    datadir = ask_directory(datadir)

    # Block form closes the file even if a write fails
    File.open(File.join(ENV['HOME'], '.sent'),'w') do |fout|
      fout.puts "workdir: #{workdir}"
      fout.puts "tmpdir: #{tmpdir}"
      fout.puts "datadir: #{datadir}"
    end
  end

  # Write the WSDL file and start the web service. Ctrl-C aborts the
  # running jobs and shuts the server down.
  def webservice
    FileUtils.cd File.join(Sent.rootdir, 'webservice/bin')
    require 'sentWS'

    host = @host || default_host
    port = @port || '8182'

    puts "Starting Server in #{ host }:#{ port }"
    server = SentWS.new("Sent", "Sent Web Server",host, port, Sent.workdir)

    wsdl_dir = File.join(Sent.rootdir, '/webservice/wsdl/')
    FileUtils.mkdir_p wsdl_dir unless File.exist? wsdl_dir
    Open.write(File.join(Sent.rootdir, '/webservice/wsdl/SentWS.wsdl'), server.wsdl)

    trap('INT') { server.abort_jobs; server.shutdown }
    server.start
  end

  # Start the merb application in production mode.
  def www
    FileUtils.cd File.join(Sent.rootdir, 'merb')

    FileUtils.mkdir_p 'cache' unless File.exist? 'cache'
    FileUtils.mkdir_p 'public/tmp' unless File.exist? 'public/tmp'

    host = @host || default_host
    port = @port || '8181'

    require 'merb-core'
    Merb.start("-a mongrel -e production -p #{ port } -h #{ host }".split)
  end

  private

  # Show the default between brackets and read one line from STDIN.
  # Returns the answer without surrounding whitespace, or the default when
  # the answer has no word characters (or STDIN is at end of file).
  def ask_directory(default)
    print "[#{ default }]? "
    input = STDIN.gets
    input =~ /\w/ ? input.strip : default
  end

  # Host name followed by the domain, as reported by the hostname command.
  def default_host
    `hostname`.chomp.strip + '.' + `hostname -d`.chomp.strip
  end

end
190
+
191
+ SimpleConsole::Application.run(ARGV, Controller, View)
192
+
193
+
@@ -0,0 +1,139 @@
1
+ require 'sent'
2
+ require 'sent/main'
3
+ require 'rbbt/sources/organism'
4
+ require 'progress-monitor'
5
+
6
+
7
+ $list = ENV['list']
8
+ $kstart = ENV['kstart'] || ENV['k']
9
+ $kend = ENV['kend'] || $kstart
10
+
11
# summary/<name> depends on NMF/<name>: run the analysis over the NMF
# results and touch the target so it exists afterwards.
summary_source = lambda { |name| name.sub(/summary/,'NMF') }

rule(/summary\/(.*)/ => summary_source) do |task|
  Sent.analyze(summary_source.call(task.name), task.name)
  FileUtils.touch task.name
end
17
+
18
# NMF/<name> depends on matrices/<name>: factorize the matrix with k
# factors, 10 executions, and touch the target.
#
# k comes from $kstart. When $kend is larger than $kstart, Sent.CCC is
# asked for one score per k in the range and the k with the highest score
# is used (the first one on ties).
rule(/NMF\/(.*)/ => lambda {|n| n.sub(/NMF/,'matrices') }) do |t|
  matrix = t.name.sub(/NMF/,'matrices')

  raise "Number of factors not specified: set k, or kstart and kend" if $kstart.nil?

  # The values come from the environment as strings; compare them as numbers
  kstart = $kstart.to_i
  kend   = $kend.to_i

  k = kstart
  if kstart < kend
    best = 0.0
    Sent.CCC(matrix, kstart, kend).each_with_index{|v,i|
      # to_f keeps fractional scores that to_i would truncate to 0
      score = v.to_f
      if score > best
        best = score
        k = kstart + i
      end
    }
  end

  Sent.NMF(matrix, t.name, k, 10)
  FileUtils.touch t.name
end
35
+
36
# matrices/<name> depends on metadocs/<name>: build the matrix from the
# metadoc, restricted to the ids read from the file named by $list (one
# per line) when that variable is set.
rule(/matrices\/(.*)/ => lambda { |name| name.sub(/matrices/,'metadocs') }) do |task|
  source = task.name.sub(/matrices/,'metadocs')
  ids = $list ? Open.read($list).collect{|l| l.chomp} : nil

  Sent.matrix(source, task.name, ids)
end
44
+
45
# metadocs/<name> depends on associations/<name>: build the metadoc file
# from the association file.
rule(/metadocs\/(.*)/ => lambda { |name| name.sub(/metadocs/,'associations') }) do |task|
  Sent.metadocs(task.name.sub(/metadocs/,'associations'), task.name)
end
50
+
51
# associations/<org>_text: find gene mentions in the text of the
# organism's articles and write one "<code>\t<pmid>" line per resolved
# code. Articles are fetched in chunks of 100.
rule(/associations\/(.*)_text/) do |t|
  org = File.basename(t.name).sub(/_text/,'')

  ner = Organism.ner(org, :rner)
  norm = Organism.norm(org)
  pmids = Organism.literature(org)

  # Block form closes the output file even if a step below raises
  File.open(t.name, 'w') do |fout|
    chunks = pmids.chunk(100)

    Progress.monitor("Finding gene-article associations in text", 1000)
    chunks.each{|chunk|
      PubMed.get_article(chunk).each{|pmid, article|
        text = article.text

        mentions = ner.extract(text)

        Progress.monitor("Resolving mentions", 1000)
        codes = mentions.collect{|mention|
          matches = norm.match(mention)
          norm.select(matches,mention,text)
        }.flatten.uniq.sort

        codes.each{|code|
          fout.puts "#{ code }\t#{pmid}"
        }
      }
    }
  end
end
82
+
83
# associations/<org> depends on associations/<org>_text: start from the
# text-mined associations, append the ones reported by
# Organism.gene_literature and Organism.gene_literature_go, and write a
# '.description' file summarizing the result.
rule(/associations\/(.*)/ => lambda{|n| n + '_text'}) do |t|
  org = File.basename(t.name)

  # Block form closes the output file even if a step below raises
  File.open(t.name, 'w') do |fout|
    fout.write Open.read(t.name + '_text')

    [:gene_literature, :gene_literature_go].each{|source|
      Organism.send(source, org).each{|gene, pmids|
        pmids.each{|pmid|
          fout.puts "#{ gene }\t#{pmid}"
        }
      }
    }
  end

  name = Organism.name(org)
  supported_ids = Organism.supported_ids(org, :examples => true)
  associations = Open.to_hash(t.name, :flatten => true)

  description =<<-EOT
Name: #{ name }
Organism: #{ name }
Description: #{associations.values.flatten.length} associations for #{associations.keys.length} genes and #{associations.values.flatten.uniq.length} articles
ID Format: #{supported_ids.collect{|p| "#{ p[0] } (#{ p[1] })"}.join(", ")}
  EOT

  Open.write(t.name + '.description', description)
end
117
+
118
# Remove every file under associations/.
task 'clean' do
  FileUtils.rm Dir.glob("associations/*")
end

# Build the metadocs of every organism, each in its own rake process.
task 'all' do
  Organism.all.each do |org|
    `rake metadocs/#{ org }`
  end
end

# Rebuild the metadocs of $org, or of every organism when $org is not
# set. With $force the existing files are removed first.
task 'update' do
  if !$org
    Rake::Task['clean'].invoke if $force
    Rake::Task['all'].invoke
  else
    FileUtils.rm Dir.glob("**/#{$org}.*") if $force
    Rake::Task["metadocs/#{$org}"].invoke
  end
end
137
+
138
+
139
+
data/lib/sent/main.rb ADDED
@@ -0,0 +1,328 @@
1
+ require 'rbbt/sources/pubmed'
2
+ require 'rbbt/util/misc'
3
+ require 'rbbt/util/open'
4
+ require 'rbbt/bow/bow'
5
+ require 'rbbt/bow/dictionary'
6
+
7
+ require 'soap/wsdlDriver'
8
+ require 'stemmer'
9
+ require 'open4'
10
+ require 'progress-monitor'
11
+ require 'yaml'
12
+
13
+ # Produce Stem lists, used by the Web Service
14
class String

  alias old_stem stem

  # Forget every stem recorded so far.
  def self.reset_stem_list
    @@stem_dictionary = Hash.new
  end

  # Same result as the original stem method, but the call also records,
  # for the resulting stem, the word it was produced from.
  def stem
    stemmed = old_stem
    words = (@@stem_dictionary[stemmed] ||= Hash.new)
    words[self] ||= 1
    stemmed
  end

  # Returns a hash from each recorded stem to the list of words that
  # reduced to it. When +dictionary+ is given, only the stems it includes
  # are returned.
  def self.stem_list(dictionary = nil)
    @@stem_dictionary.inject(Hash.new) do |list, (stemmed, words)|
      list[stemmed] = words.keys if !dictionary || dictionary.include?(stemmed)
      list
    end
  end

  reset_stem_list
end
43
+
44
+
45
+ module Sent
46
+
47
+ class NoGenesError < StandardError; end
48
+ class ProcessAbortedError < StandardError; end
49
+ class WSError < StandardError; end
50
+
51
# Run +command+ in a fresh R process after sourcing matrix.R from
# Sent.rdir.
#
# Returns everything R wrote to stdout followed by everything it wrote to
# stderr; non-empty output is also printed.
# Raises Sent::ProcessAbortedError when R exits with a non-zero status;
# the captured output is appended to the error message.
def self.run_R(command)
  pid, stdin, stdout, stderr = Open4::popen4 "R --vanilla --slave"
  stdin.write "source('#{File.join(Sent.rdir,'matrix.R')}');\n"
  stdin.write "#{ command };\n"
  stdin.close

  # Drain both pipes before waiting for the process: a child that fills a
  # pipe nobody reads would never exit. stderr is read in its own thread
  # so neither pipe can block the other.
  err_reader = Thread.new { stderr.read }
  result = stdout.read + err_reader.value
  stdout.close
  stderr.close

  Process.wait pid
  if $?.exitstatus != 0
    message = "Error in R process"
    message += ":\n" + result if result != ""
    raise ProcessAbortedError, message
  end

  puts result if result != ""
  result
end
66
+
67
# Build the metadoc matrix for the gene-article associations in
# +assocfile+ and write it to +output+, one row per gene and one column
# per selected term. Also writes '<output>.dict' (term weights) and
# '<output>.stems' (the words recorded for each stem of the terms).
#
# +low+, +hi+ and +max+ are passed to the dictionary as the :low, :hi and
# :limit options that select the terms.
# Raises Sent::NoGenesError when the association file has no genes.
def self.metadocs(assocfile, output, low=0.001, hi=0.65, max=3000)

  associations = Open.to_hash(assocfile, :flatten => true, :sep => "\t|,")
  # Without genes the threshold below would divide by zero
  raise Sent::NoGenesError, "No genes found in #{ assocfile }" if associations.empty?

  # Text of all the articles in the list, one article after the other
  articles_text = lambda{|pmids|
    PubMed.get_article(pmids).collect{|p| p[1].text}.join("\n")
  }

  dict = Dictionary::TF_IDF.new

  String.reset_stem_list

  Progress.monitor("Building Dictionary for #{File.basename(output)}", 1000)
  associations.each{|gene, pmids|
    dict.add(BagOfWords.count(articles_text.call(pmids).bigrams))
  }

  # At least 3 genes must have a word to be chosen
  hard_min = 3 * 100 / associations.keys.length
  hi = hard_min if hi < hard_min

  d = dict.weights(:low => low, :hi => hi, :limit => max)
  Open.write(output + '.dict', d.sort.collect{|p| p.join("\t")}.join("\n"))
  terms = d.keys.sort

  # Block form closes the output file even if fetching an article fails
  File.open(output, 'w') do |fout|
    fout.puts("\t" + terms.join("\t"))

    Progress.monitor("Building Metadoc for #{File.basename(output)}", 1000)
    associations.each{|gene, pmids|
      fout.puts(([gene] + BagOfWords.features(articles_text.call(pmids), terms)).join("\t"))
    }
  end

  Open.write(output + '.stems', String.stem_list(terms.collect{|p| p.split(/ /)}.flatten.uniq).collect{|k,v| "#{ k }\t#{v.join("\t")}"}.join("\n"))
end
101
+
102
# Prepare the matrix file +output+ from the metadoc file +metadocs+.
#
# With an empty or nil +list+ the whole metadoc is used. Otherwise only
# the header line and the rows whose first tab-separated field is in
# +list+ are kept. The file is then normalized in place by R, using the
# weights in '<metadocs>.dict'.
#
# Raises Sent::NoGenesError when +list+ is given and no row matches.
def self.matrix(metadocs, output, list=nil)
  list ||= []

  if list.empty?
    FileUtils.cp metadocs, output
  else
    # Filter in Ruby rather than through the shell, so ids and paths with
    # shell or regular expression metacharacters are taken literally
    wanted = {}
    list.each{|id| wanted[id.to_s] = true }

    matched = 0
    File.open(metadocs) do |fin|
      File.open(output, 'w') do |fout|
        header = fin.gets
        fout.write header if header

        fin.each_line do |line|
          id = line.split("\t", 2).first.to_s.chomp
          next unless wanted[id]
          fout.write line
          matched += 1
        end
      end
    end

    raise Sent::NoGenesError, "No Genes Matched" if matched == 0
  end

  run_R("SENT.prepare.matrix('#{ output }', '#{ output }', '#{metadocs + '.dict'}')")
end
116
+
117
+
118
+ @@bionmf_wsdl = "http://bionmf.dacya.ucm.es/WebService/BioNMFWS.wsdl"
119
# Factorize the matrix file +matrix+ with +k+ factors through the bioNMF
# web service, running +executions+ independent jobs in parallel threads.
#
# Each job writes '<out>.matrix_w.<n>' and '<out>.matrix_h.<n>'; once all
# threads finish the files are joined by the R function SENT.join.results
# and removed.
#
# A job is retried up to 3 times on a timeout or any other exception.
# Ctrl-C aborts the jobs and raises Sent::ProcessAbortedError. An error
# reported by the service for a job raises Exception after all threads
# have finished.
def self.NMF(matrix, out, k, executions = 10)
  driver = SOAP::WSDLDriverFactory.new( @@bionmf_wsdl).create_rpc_driver

  # Upload matrix
  nmf_matrix = driver.upload_matrix(
    File.read(matrix), # matrix (File.read does not leave the file open)
    false,  # binary
    true,   # column labels
    true,   # row labels
    true,   # transpose
    "No",   # positive
    "No",   # normalization
    "matrix") # Suggested name

  while !driver.done(nmf_matrix)
    sleep(5)
  end

  if driver.error(nmf_matrix)
    raise "Error pre-processing matrix!" + driver.messages(nmf_matrix).join("\n")
  end

  # Send several executions in parallel
  threads = []
  error = nil
  executions.times{|i|
    threads << Thread.new(i){ |num|
      times = 3
      job_id = nil
      begin

        job_id = driver.standardNMF(
          nmf_matrix, # Matrix job
          "Standard", # Algorithm
          k,          # Factor Start
          k,          # Factor End
          1,          # Runs
          2000,       # Iterations
          40,         # Stop criteria
          0,          # Not used (nsnmf smoothness)
          false,      # extra info
          '')         # Suggested name

        while !driver.done(job_id)
          sleep(5)
        end

        if driver.error(job_id)
          error = driver.messages(job_id).join("\n")
          raise "Error in NMF" + driver.messages(job_id).join("\n")
        end

        results = driver.results(job_id)
        # Block form closes each file even if fetching the result fails
        File.open(out + ".matrix_w.#{num}",'w') do |fw|
          fw.write(driver.result(results[0]).sub(/\t(.*)\t$/,'\1'))
        end
        File.open(out + ".matrix_h.#{num}",'w') do |fh|
          fh.write(driver.result(results[1]).sub(/\t(.*)\t$/,'\1'))
        end
        driver.clean(job_id)
      rescue Sent::ProcessAbortedError
        puts "Process aborted for #{ num }"
        # No job to abort if the interruption came before it was created
        driver.abort(job_id) if job_id
      rescue Timeout::Error
        if times > 0
          times -= 1
          sleep 2
          retry
        else
          raise Sent::ProcessAbortedError, "NMF Execution #{ num } timed out"
        end
      rescue Exception
        puts $!.message
        if times > 0
          times -= 1
          puts "Retrying thread #{ num }"
          retry
        else

          puts "NMF Execution #{ num } Produced Exception"
          puts $!.class
          puts $!.message
          puts $!.backtrace
          raise Sent::ProcessAbortedError, "NMF Execution #{ num } Produced Exception"
        end
      ensure
        Thread.exit
      end
    }
    sleep 1

  }

  # Allow threads to be aborted
  aborted = false
  old_int = Signal.trap("INT") do
    STDERR.puts "Killing threads"
    threads.each{|t| t.raise Sent::ProcessAbortedError, "Process Aborted"}
    aborted = true
  end

  threads.each { |aThread| aThread.join }

  Signal.trap("INT", old_int)
  driver.clean(nmf_matrix)

  if aborted
    raise Sent::ProcessAbortedError, "Process Aborted"
  end

  if error
    raise Exception, "Error in NMF:\n" + error
  end

  run_R("SENT.join.results('#{ out }')")

  FileUtils.rm Dir.glob(out + '.matrix_*.*')
end
236
+
237
# Ask the bioNMF web service for a sample classification of the matrix
# file +matrix+ for every number of factors from +kstart+ to +kend+
# (integers or numeric strings, with kstart < kend).
#
# Returns the tab-separated fields of the last line of the first result,
# as strings.
# Raises RuntimeError on an invalid range, Sent::ProcessAbortedError when
# interrupted with Ctrl-C and Sent::WSError when the service reports an
# error.
def self.CCC(matrix, kstart, kend)
  # Compare as numbers: as strings, "9" would not be smaller than "10"
  raise "Error in range: #{ kstart } to #{ kend }" if kstart.to_i >= kend.to_i

  driver = SOAP::WSDLDriverFactory.new( @@bionmf_wsdl).create_rpc_driver

  # Prepare matrix for processing
  nmf_matrix = driver.upload_matrix(File.read(matrix))
  driver.preprocess(nmf_matrix,1,"No","No", true, true)

  job_id = driver.sample_classification(nmf_matrix,kstart.to_i,kend.to_i,10)

  aborted = false
  old_int = Signal.trap("INT") do
    puts "Aborting bestK process"
    driver.abort(job_id)
    aborted = true
  end

  status = nil
  begin
    while (status = driver.status(job_id)) == 0
      sleep(5)
    end

    driver.clean_matrix(nmf_matrix)
  ensure
    # Put the previous handler back even if the service calls fail
    Signal.trap("INT", old_int)
  end

  if aborted
    raise Sent::ProcessAbortedError, "Process Aborted"
  end

  if status == -1
    raise Sent::WSError, "Error processing matrix:\n" + driver.info(job_id)
  end

  results = driver.results(job_id)
  text = driver.get_result(results[0])
  text.split(/\n/).last.split(/\t/)
end
274
+
275
# Run the R function SENT.analyze over the results stored under +prefix+
# and collect the word and gene lists it writes into '<output>.summary',
# a YAML list of {:words => [...], :genes => [...]} groups.
#
# +clusters+ is the number of groups; nil (or an empty string) is passed
# to R as NULL. +num_words+ is the number of words kept per group.
# Existing '<output>*.words' and '<output>*.genes' files are removed
# first.
def self.analyze(prefix, output, clusters = nil, num_words = 15)

  FileUtils.rm Dir.glob(output + '*.words') + Dir.glob(output + '*.genes')

  # Pass numbers and NULL as R values, not as quoted strings: a quoted
  # empty string is not NULL in R
  r_clusters = clusters.to_s.strip.empty? ? 'NULL' : clusters.to_i
  run_R("SENT.analyze('#{ prefix }', '#{ output }', #{ r_clusters }, #{ num_words.to_i })")

  words = Dir.glob(output + '*.words').sort.collect{|f| Open.read(f).split(/\n/)}
  genes = Dir.glob(output + '*.genes').sort.collect{|f| Open.read(f).split(/\n/)}

  groups = words.zip(genes).collect{|group_words, group_genes|
    {:words => group_words, :genes => group_genes}
  }

  Open.write(output + '.summary', groups.to_yaml)
end
289
+
290
# Build a Ferret index at +outfile+ with one document per article in
# +pmids+. Each document stores the pmid under :id and the terms of the
# abstract and of the title, each repeated as many times as it was
# counted. Articles that raise an exception are reported and skipped.
#
# NOTE(review): the fields declared are :title and :abstract, but the
# title terms are stored under :name -- confirm which one is intended.
def self.literature_index(pmids, outfile)

  index = Ferret::Index::Index.new(:path => outfile)

  index.field_infos.add_field(:title, :index => :yes, :boost => 0.67)
  index.field_infos.add_field(:abstract, :index => :yes, :boost => 0.33)

  # Turn the terms of a text into a string with every term repeated n times
  expand_terms = lambda{|text|
    BagOfWords.terms(text).collect{|w,n| (1..n).collect{ w }}.flatten.join(" ")
  }

  Progress.monitor("Building index for #{pmids.length} articles")
  pmids.each do |pmid|
    begin
      article = PubMed.get_article(pmid)
      abstract = article.abstract
      title = article.title

      abstract_content = expand_terms.call(abstract)
      title_content = expand_terms.call(title)

      index << {:id => pmid, :abstract => abstract_content, :name => title_content}
    rescue Exception
      puts $!.backtrace
      puts $!.message
    end
  end

  index.close
end
316
+
317
# Search the Ferret index stored at the path +index+ for the stems of
# +words+. Returns a list of [id, score] pairs, where id is the :id field
# of the matching document; at most 8000 hits are examined and only
# scores above 0.0001 are kept.
def self.search_index(words, index)
  index = Ferret::Index::Index.new(:path => index)
  query = words.collect{|w| w.stem}.join(" ")

  ranks = []
  index.search_each(query, :limit => 8000) do |id, score|
    ranks << [index[id][:id], score] if score > 0.0001
  end

  ranks
end
328
+ end
data/lib/sent.rb ADDED
@@ -0,0 +1,71 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
+ require 'yaml'
5
+ require 'fileutils'
6
+ require 'stemmer'
7
+ require 'ferret'
8
+
9
# Configuration of the sent directories. The configuration is loaded when
# this file is required.
module Sent

  class NoConfigError < StandardError; end

  @@rootdir = File.dirname(File.dirname(__FILE__))
  @@datadir = @@workdir = @@tmpdir = nil

  # Read 'datadir', 'workdir' and 'tmpdir' from the YAML file
  # <rootdir>/sent.config and then from $HOME/.sent, so values in the
  # second file override the first. Files that are missing or do not hold
  # a hash are ignored.
  #
  # Raises Sent::NoConfigError unless all three directories end up
  # defined. The directories are created when they do not exist.
  def self.load_config
    [File.join(@@rootdir, 'sent.config'), File.join(ENV['HOME'], '.sent')].each do |file|
      next unless File.exist?(file)

      config = YAML.load_file(file)
      next unless config.is_a? Hash

      @@datadir = config['datadir'] if config['datadir']
      @@workdir = config['workdir'] if config['workdir']
      @@tmpdir  = config['tmpdir']  if config['tmpdir']
    end

    if @@datadir.nil? || @@workdir.nil? || @@tmpdir.nil?
      raise Sent::NoConfigError, "sent not configured. Edit #{File.join(@@rootdir, 'sent.config')} or $HOME/.sent"
    end

    FileUtils.mkdir_p @@datadir unless File.exist? @@datadir
    FileUtils.mkdir_p @@workdir unless File.exist? @@workdir
    FileUtils.mkdir_p @@tmpdir unless File.exist? @@tmpdir
  end

  def self.datadir
    @@datadir
  end

  def self.workdir
    @@workdir
  end

  def self.tmpdir
    @@tmpdir
  end

  # Directory two levels above this file.
  def self.rootdir
    @@rootdir
  end

  # Directory with the R scripts.
  def self.rdir
    File.join(@@rootdir, 'R')
  end

  self.load_config
end
69
+
70
+
71
+
@@ -0,0 +1,17 @@
1
+ require 'sent'
2
+
3
+ $datadir = Sent.datadir
4
+ $scriptdir = File.join(Sent.rootdir, '/install_scripts')
5
+
6
# Install the analysis environment under <datadir>/analysis: copy the
# analysis Rakefile from the install scripts and create one directory per
# stage of the analysis.
task 'analysis' do
  directory = "#{$datadir}/analysis"
  FileUtils.mkdir_p directory

  %w(Rakefile).each do |file|
    FileUtils.cp_r File.join($scriptdir, "analysis/#{ file }"), directory
  end

  %w(associations metadocs matrices NMF summary).each do |stage|
    FileUtils.mkdir_p File.join(directory, stage)
  end
end
17
+
data/test/helper.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+
4
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
5
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
6
+ require 'sent'
7
+
8
+ class Test::Unit::TestCase
9
+ end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbbt-sent
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Miguel Vazquez
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-09 00:00:00 +01:00
13
+ default_executable: sent_config
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rake
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: simpleconsole
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: rbbt
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: ferret
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ - !ruby/object:Gem::Dependency
56
+ name: stemmer
57
+ type: :runtime
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ - !ruby/object:Gem::Dependency
66
+ name: progress-monitor
67
+ type: :runtime
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ version:
75
+ - !ruby/object:Gem::Dependency
76
+ name: open4
77
+ type: :runtime
78
+ version_requirement:
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: "0"
84
+ version:
85
+ description: Use literature mining to find semantic features to describe clusters of genes
86
+ email: miguel.vazquez@fdi.ucm.es
87
+ executables:
88
+ - sent_config
89
+ extensions: []
90
+
91
+ extra_rdoc_files:
92
+ - LICENSE
93
+ - README.rdoc
94
+ files:
95
+ - R/matrix.R
96
+ - bin/sent_config
97
+ - install_scripts/analysis/Rakefile
98
+ - lib/sent.rb
99
+ - lib/sent/main.rb
100
+ - tasks/install.rake
101
+ - test/helper.rb
102
+ - LICENSE
103
+ - README.rdoc
104
+ has_rdoc: true
105
+ homepage: http://github.com/mikisvaz/rbbt-sent
106
+ licenses: []
107
+
108
+ post_install_message:
109
+ rdoc_options:
110
+ - --charset=UTF-8
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: "0"
118
+ version:
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: "0"
124
+ version:
125
+ requirements: []
126
+
127
+ rubyforge_project:
128
+ rubygems_version: 1.3.5
129
+ signing_key:
130
+ specification_version: 3
131
+ summary: Semantic Features in Text
132
+ test_files:
133
+ - test/helper.rb