rbbt-sent 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Miguel Vazquez
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/R/matrix.R ADDED
@@ -0,0 +1,167 @@
1
+ library('som')
2
+
3
+ SENT.norm <- function(feats, feat.weights = NULL){
4
+ s = as.matrix(feats) %*% matrix(1,nrow=dim(feats)[2],ncol=1);
5
+ s = matrix(100/s,nrow=length(s),ncol=dim(feats)[2]);
6
+ feats.norm = feats * s;
7
+ rm(s);
8
+
9
+ feats.norm[is.na(feats.norm)] = 0
10
+
11
+ if (!is.null(feat.weights)){
12
+ feats.norm = feats.norm * matrix(abs(feat.weights),ncol=length(feat.weights),nrow=dim(feats.norm)[1],byrow=T)
13
+ }
14
+
15
+ feats.norm;
16
+ }
17
+ # Reads a tab-separated matrix, drops all-zero columns, normalizes it with SENT.norm and writes it to file.output.
18
+ SENT.prepare.matrix <- function(file.input, file.output, file.dict= NULL){
19
+   feats = read.table(file.input, sep="\t", header=TRUE, row.names=1,check.names=FALSE);
20
+   feats.weights = NULL;
21
+   if (!is.null(file.dict)){
22
+     feats.weights = as.matrix(read.table(file=file.dict, sep="\t", row.names=1));
23
+   }
24
+
25
+   # Keep only columns with a positive sum; drop=FALSE keeps the table 2-D when a single column remains.
26
+   good.words = apply(feats,2,sum) > 0
27
+   feats = feats[,good.words,drop=FALSE]
28
+   # Filter the weights with the same mask (assumes dictionary rows follow the column order - TODO confirm).
29
+   if (!is.null(feats.weights)) feats.weights = feats.weights[good.words,,drop=FALSE]
30
+   feats = SENT.norm(feats, feats.weights);
31
+   write.table(file=file.output, feats, sep="\t", quote=FALSE)
32
+ }
33
+
34
+ SENT.join.results <- function(prefix){
35
+ files.w <- Sys.glob(paste(prefix,'.matrix_w.*',sep=""))
36
+ files.h <- Sys.glob(paste(prefix,'.matrix_h.*',sep=""))
37
+
38
+ data.w <- NULL
39
+ for (file in files.w){
40
+ data <- read.table(file, sep="\t", header=T, row.names=1, check.names=FALSE)
41
+ if (is.null(data.w)){
42
+ data.w = data
43
+ }else{
44
+ data.w <- cbind(data.w,data)
45
+ }
46
+ }
47
+
48
+ write.table(file=paste(prefix,'.features',sep=''),t(data.w), sep="\t", quote=FALSE, row.names = F)
49
+
50
+
51
+ data.h <- NULL
52
+ for (file in files.h){
53
+ data <- read.table(file, sep="\t", header=T, row.names=1, check.names=FALSE)
54
+ if (is.null(data.h)){
55
+ data.h = data
56
+ }else{
57
+ data.h <- rbind(data.h,data)
58
+ }
59
+ }
60
+
61
+ write.table(file=paste(prefix,'.profiles',sep=''),t(data.h), sep="\t", quote=FALSE,col.names=F)
62
+ }
63
+
64
+
65
+ SENT.analyze <- function(prefix, output, clusters = NULL, num.words = 15){
66
+ profiles <- read.table(paste(prefix, '.profiles',sep=""),sep="\t", row.names=1, check.names=F);
67
+ features <- read.table(paste(prefix, '.features',sep=""),sep="\t", header=T, check.names=F);
68
+
69
+ # Assume 10 repetitions
70
+ if (is.null(clusters)){
71
+ clusters = dim(features)[1] / 10 ;
72
+ }
73
+
74
+ # Form a clustering
75
+ fdist = dist(features)
76
+ hfeatures <- hclust(fdist, method="ward");
77
+ cfeatures <- cutree(hfeatures, k=clusters);
78
+
79
+ coph <- cor(fdist,cophenetic(hfeatures));
80
+ write(coph, file = paste(output, '.cophenetic',sep=""));
81
+
82
+
83
+
84
+ # Average between clusters
85
+ profiles.merged = vector();
86
+ features.merged = vector();
87
+ for (i in levels(factor(cfeatures))){
88
+ profiles.merged = cbind(profiles.merged, apply(as.matrix(profiles[,cfeatures==i]),1,mean, trim=0.1));
89
+ features.merged = rbind(features.merged, apply(as.matrix(features[cfeatures==i,]),2,mean, trim=0.1));
90
+ }
91
+
92
+
93
+ rownames(profiles.merged) <- rownames(profiles);
94
+ colnames(features.merged) <- colnames(features);
95
+
96
+ write.table(file=paste(output,'.merged.profiles',sep=''),profiles.merged, sep="\t", quote=FALSE,col.names=F)
97
+ write.table(file=paste(output,'.merged.features',sep=''),t(features.merged), sep="\t", quote=FALSE,col.names=F)
98
+
99
+ # Hard assign genes to features
100
+ profiles.bin = profiles.merged
101
+ for (i in 1:dim(profiles.bin)[1] ){
102
+ m = sort(profiles.bin[i,],index.return = T,decreasing = T)$ix[1];
103
+ profiles.bin[i,] = 0;
104
+ profiles.bin[i,m] = 1;
105
+ }
106
+
107
+ profiles.sorted = c();
108
+ profiles.bin.sorted = c();
109
+ glabels=c();
110
+
111
+
112
+
113
+ fgroups = cfeatures[unlist(dendrapply(as.dendrogram(hfeatures), function(e) attr(e, "label")))];
114
+ flabels = sapply(seq(1,dim(features)[1]-1), function(i){ if(fgroups[i] != fgroups[i+1]){ '___'}else{''}});
115
+ flabels = c(flabels,'');
116
+
117
+ flabels[unlist(dendrapply(as.dendrogram(hfeatures), function(e) attr(e, "label")))] = flabels;
118
+
119
+ order=unique(fgroups);
120
+ for (i in order){
121
+ profiles.sorted = rbind(profiles[profiles.bin[,i]==1,], profiles.sorted);
122
+ if (sum(profiles.bin[,i]==1) == 0) next
123
+ glabels = c(rep('',sum(profiles.bin[,i]==1)-1),glabels);
124
+ glabels = c('___',glabels);
125
+ }
126
+
127
+ # Produce heatmap image
128
+ bitmap(file=paste(output,'.jpg',sep=""),type='jpeg',res=75);
129
+ heatmap(as.matrix(profiles),Rowv=NA,Colv=as.dendrogram(hfeatures),xlab="Factors from 10 factorizations", ylab="Genes", labRow=glabels, labCol=flabels, margins=c(4,4));
130
+
131
+
132
+ # Produce heatmap image for hard assignment
133
+ bitmap(file=paste(output,'.hard.jpg',sep=""),type='jpeg',res=75);
134
+ heatmap(as.matrix(profiles.sorted),Rowv=NA,Colv=as.dendrogram(hfeatures),xlab="Factors from 10 factorizations", ylab="Genes", labRow=glabels, labCol=flabels, margins=c(4,4));
135
+
136
+ dev.off();
137
+
138
+ features.merged.scores = apply(features.merged,2,function(x){
139
+ sapply(x,function(v){
140
+ (v - (sum(x) - v)/(length(x) - 1))
141
+ })
142
+ })
143
+
144
+ # a = 0.1
145
+ # features.merge.specificity = apply(features.merged, 2, function(x){ sapply(x, function(v){ v / mean(x)})})
146
+ # features.merge.importance = apply(features.merged, 1, function(x){ sapply(x, function(v){ v / mean(x)})})
147
+ # features.scores = a * t(features.merge.importance) + (1-a) * features.merge.specificity
148
+
149
+
150
+
151
+
152
+
153
+ # Save Group Genes and Words
154
+ g = 1
155
+ for (i in order){
156
+ genes = rownames(profiles)[profiles.bin[,i] == 1];
157
+ cat(file=paste(output,g,'genes',sep="."),genes,sep="\n");
158
+ words = names(sort(features.merged.scores[i,],decreasing=T))[1:num.words];
159
+ ##features.t.test = apply(features, 2, function(x){ t.test(x[cfeatures == i],x[cfeatures != i])})
160
+ ##words = names(sort(sapply(features.t.test,function(x) { x$p.value}),decreasing=F))[1:num.words]
161
+ #words = names(sort(features.scores[i,],decreasing=T))[1:num.words]
162
+ cat(file=paste(output,g,'words',sep="."),words,sep="\n");
163
+ g = g + 1
164
+ }
165
+ }
166
+
167
+
data/README.rdoc ADDED
@@ -0,0 +1,23 @@
1
+ = rbbt-sent
2
+
3
+ Use literature mining to find semantic features that describe clusters of genes.
4
+
5
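+ === Requirements
6
+
7
+ * Install the R som package (R/matrix.R loads it with library('som')).
8
+ * Install ghostscript (R's bitmap() device, used for the heatmap images, needs it).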
9
+
10
+ == Note on Patches/Pull Requests
11
+
12
+ * Fork the project.
13
+ * Make your feature addition or bug fix.
14
+ * Add tests for it. This is important so I don't break it in a
15
+ future version unintentionally.
16
+ * Commit, do not mess with rakefile, version, or history.
17
+ (if you want to have your own version, that is fine but
18
+ bump version in a commit by itself I can ignore when I pull)
19
+ * Send me a pull request. Bonus points for topic branches.
20
+
21
+ == Copyright
22
+
23
+ Copyright (c) 2009 Miguel Vazquez. See LICENSE for details.
data/bin/sent_config ADDED
@@ -0,0 +1,193 @@
1
+ #!/usr/bin/ruby
2
+
3
+ require 'simpleconsole'
4
+
5
+ begin
6
+ require 'sent'
7
+ rescue
8
+ $noconfig = true
9
+ end
10
+
11
+ $USAGE =<<EOT
12
+ #{__FILE__} <action> [<subaction>] [--force] [--organism <org>]
13
+ actions:
14
+ * configure: Set paths for data, work, and tmp directories
15
+
16
+ * install:
17
+ * analysis: Install configuration to perform analysis
18
+
19
+ * update:
20
+ * metadocs: Generate metadocs for all organisms
21
+
22
+ * init:
23
+ * webservice:
24
+ * www:
25
+
26
+
27
+
28
+ EOT
29
+
30
+ class Controller < SimpleConsole::Controller
31
+
32
+ params :bool => {:f => :force},
33
+ :string => {:o => :organism, :h => :host, :p => :port}
34
+
35
+ def init
36
+
37
+ @host = params[:host]
38
+ @port = params[:port]
39
+ render :action => params[:id]
40
+
41
+ end
42
+
43
+ def default
44
+ render :action => :usage
45
+ end
46
+
47
+ def help
48
+ render :action => :usage
49
+ end
50
+
51
+ def install
52
+ raise "Run #{__FILE__} configure first to configure sent" if $noconfig
53
+
54
+ case params[:id]
55
+ when "analysis"
56
+ @tasks = %w(analysis)
57
+ when nil
58
+ redirect_to :action => :help, :id => :install
59
+ else
60
+ @tasks = [params[:id]]
61
+ end
62
+
63
+ $force = true if params[:force]
64
+ $org = params[:organism] if params[:organism]
65
+
66
+ end
67
+
68
+ def update
69
+ raise "Run #{__FILE__} configure first to configure rbbt" if $noconfig
70
+
71
+ case params[:id]
72
+ when "metadocs"
73
+ @location = File.join(Sent.datadir,'analysis')
74
+ else
75
+ redirect_to :action => :help, :id => :update
76
+ end
77
+
78
+ $force = true if params[:force]
79
+ $org = params[:organism] if params[:organism]
80
+
81
+ end
82
+
83
+ def configure
84
+ end
85
+
86
+ end
87
+
88
+ class View < SimpleConsole::View
89
+ def usage
90
+ puts $USAGE
91
+ end
92
+
93
+ def install
94
+ require 'rake'
95
+ load File.join(Sent.rootdir, 'tasks/install.rake')
96
+
97
+ @tasks.each{|t|
98
+ puts "Invoking #{ t }"
99
+ Rake::Task[t].invoke
100
+ }
101
+ end
102
+
103
+ def update
104
+ require 'rake'
105
+
106
+ puts "Changing directory to #{@location}"
107
+ chdir @location
108
+
109
+ load "./Rakefile"
110
+
111
+ Rake::Task['update'].invoke
112
+ end
113
+
114
+
115
+ def configure
116
+
117
+ defaultdir = File.join(ENV['HOME'],'sent')
118
+
119
+ workdir = File.join(defaultdir, 'work')
120
+ tmpdir = File.join(defaultdir, 'tmp')
121
+ datadir = File.join(defaultdir, 'data')
122
+
123
+ puts "Please indicate where you wish to place the data directories"
124
+ puts
125
+
126
+ puts
127
+ puts "* work Directory: This directory will hold downloads, from PubMed,
128
+ Entrez and other, for local store. It might grow considerably."
129
+ print "[#{ workdir }]? "
130
+ input = STDIN.gets
131
+ workdir = input if input =~ /\w/
132
+
133
+ puts
134
+ puts "* Tmp Directory: Temporary files."
135
+ print "[#{ tmpdir }]? "
136
+ input = STDIN.gets
137
+ tmpdir = input if input =~ /\w/
138
+
139
+ puts
140
+ puts "* Data Directory: Holds data from organisism, databases, third party software, etc."
141
+ print "[#{ datadir }]? "
142
+ input = STDIN.gets
143
+ datadir = input if input =~ /\w/
144
+
145
+
146
+
147
+ fout = File.open(File.join(ENV['HOME'], '.sent'),'w')
148
+ fout.puts "workdir: #{workdir}"
149
+ fout.puts "tmpdir: #{tmpdir}"
150
+ fout.puts "datadir: #{datadir}"
151
+ fout.close
152
+
153
+ end
154
+
155
+ def webservice
156
+
157
+ FileUtils.cd File.join(Sent.rootdir, 'webservice/bin')
158
+ require 'sentWS'
159
+
160
+ host = @host || `hostname`.chomp.strip + '.' + `hostname -d`.chomp.strip
161
+ port = @port || '8182'
162
+
163
+ puts "Starting Server in #{ host }:#{ port }"
164
+ server = SentWS.new("Sent", "Sent Web Server",host, port, Sent.workdir)
165
+
166
+ FileUtils.mkdir_p File.join(Sent.rootdir, '/webservice/wsdl/') unless File.exist? File.join(Sent.rootdir, '/webservice/wsdl/')
167
+ Open.write(File.join(Sent.rootdir, '/webservice/wsdl/SentWS.wsdl'), server.wsdl)
168
+
169
+ trap('INT') { server.abort_jobs; server.shutdown }
170
+ server.start
171
+
172
+ end
173
+
174
+ def www
175
+ FileUtils.cd File.join(Sent.rootdir, 'merb')
176
+
177
+ FileUtils.mkdir_p 'cache' unless File.exist? 'cache'
178
+ FileUtils.mkdir_p 'public/tmp' unless File.exist? 'public/tmp'
179
+
180
+ host = @host || `hostname`.chomp.strip + '.' + `hostname -d`.chomp.strip
181
+ port = @port || '8181'
182
+
183
+ require 'merb-core'
184
+ Merb.start("-a mongrel -e production -p #{ port } -h #{ host }".split)
185
+
186
+
187
+ end
188
+
189
+ end
190
+
191
+ SimpleConsole::Application.run(ARGV, Controller, View)
192
+
193
+
@@ -0,0 +1,139 @@
1
+ require 'sent'
2
+ require 'sent/main'
3
+ require 'rbbt/sources/organism'
4
+ require 'progress-monitor'
5
+
6
+
7
+ $list = ENV['list']
8
+ $kstart = ENV['kstart'] || ENV['k']
9
+ $kend = ENV['kend'] || $kstart
10
+
11
+ rule (/summary\/(.*)/) => lambda {|n| n.sub(/summary/,'NMF') } do |t|
12
+ nmf = t.name.sub(/summary/,'NMF')
13
+
14
+ Sent.analyze(nmf, t.name)
15
+ FileUtils.touch t.name
16
+ end
17
+
18
+ rule (/NMF\/(.*)/) => lambda {|n| n.sub(/NMF/,'matrices') } do |t|
19
+ matrix = t.name.sub(/NMF/,'matrices')
20
+
21
+ k = $kstart
22
+ if $kstart < $kend
23
+ best = 0
24
+ ccc = Sent.CCC(matrix, $kstart, $kend)
25
+ ccc.each_with_index{|v,i|
26
+ if v.to_i > best
27
+ k = $kstart.to_i + i
28
+ end
29
+ }
30
+ end
31
+
32
+ Sent.NMF(matrix, t.name, k.to_i, 10)
33
+ FileUtils.touch t.name
34
+ end
35
+
36
+ rule (/matrices\/(.*)/) => lambda {|n| n.sub(/matrices/,'metadocs') } do |t|
37
+ metadocs = t.name.sub(/matrices/,'metadocs')
38
+
39
+ list = nil
40
+ list = Open.read($list).collect{|l| l.chomp} if $list
41
+
42
+ Sent.matrix(metadocs, t.name, list)
43
+ end
44
+
45
+ rule (/metadocs\/(.*)/) => lambda {|n| n.sub(/metadocs/,'associations') } do |t|
46
+ assocfile = t.name.sub(/metadocs/,'associations')
47
+
48
+ Sent.metadocs(assocfile, t.name)
49
+ end
50
+
51
+ rule(/associations\/(.*)_text/) do |t|
52
+ org = File.basename(t.name).sub(/_text/,'')
53
+
54
+ ner = Organism.ner(org, :rner)
55
+ norm = Organism.norm(org)
56
+ pmids = Organism.literature(org)
57
+
58
+ fout = File.open(t.name, 'w')
59
+ chunks = pmids.chunk(100)
60
+
61
+ Progress.monitor("Finding gene-article associations in text", 1000)
62
+ chunks.each{|chunk|
63
+ PubMed.get_article(chunk).each{|pmid, article|
64
+ text = article.text
65
+
66
+ mentions = ner.extract(text)
67
+
68
+ Progress.monitor("Resolving mentions", 1000)
69
+ codes = mentions.collect{|mention|
70
+ matches = norm.match(mention)
71
+ norm.select(matches,mention,text)
72
+ }.flatten.uniq.sort
73
+
74
+ codes.each{|code|
75
+ fout.puts "#{ code }\t#{pmid}"
76
+ }
77
+
78
+ }
79
+ }
80
+ fout.close
81
+ end
82
+
83
+ rule (/associations\/(.*)/) => lambda{|n| n + '_text'} do |t|
84
+ org = File.basename(t.name)
85
+
86
+ fout = File.open(t.name, 'w')
87
+ fout.write Open.read(t.name + '_text')
88
+
89
+ associations = Organism.gene_literature(org)
90
+ associations.each{|gene, pmids|
91
+ pmids.each{|pmid|
92
+ fout.puts "#{ gene }\t#{pmid}"
93
+ }
94
+ }
95
+
96
+ associations = Organism.gene_literature_go(org)
97
+ associations.each{|gene, pmids|
98
+ pmids.each{|pmid|
99
+ fout.puts "#{ gene }\t#{pmid}"
100
+ }
101
+ }
102
+ fout.close
103
+
104
+ name = Organism.name(org)
105
+ supported_ids = Organism.supported_ids(org, :examples => true)
106
+ associations = Open.to_hash(t.name, :flatten => true)
107
+
108
+ description =<<-EOT
109
+ Name: #{ name }
110
+ Organism: #{ name }
111
+ Description: #{associations.values.flatten.length} associations for #{associations.keys.length} genes and #{associations.values.flatten.uniq.length} articles
112
+ ID Format: #{supported_ids.collect{|p| "#{ p[0] } (#{ p[1] })"}.join(", ")}
113
+ EOT
114
+
115
+ Open.write(t.name + '.description', description)
116
+ end
117
+
118
+ task 'clean' do
119
+ FileUtils.rm Dir.glob("associations/*")
120
+ end
121
+
122
+ task 'all' do
123
+ Organism.all.each{|org|
124
+ `rake metadocs/#{ org }`
125
+ }
126
+ end
127
+
128
+ task 'update' do
129
+ if $org
130
+ FileUtils.rm Dir.glob("**/#{$org}.*") if $force
131
+ Rake::Task["metadocs/#{$org}"].invoke
132
+ else
133
+ Rake::Task['clean'].invoke if $force
134
+ Rake::Task['all'].invoke
135
+ end
136
+ end
137
+
138
+
139
+
data/lib/sent/main.rb ADDED
@@ -0,0 +1,328 @@
1
+ require 'rbbt/sources/pubmed'
2
+ require 'rbbt/util/misc'
3
+ require 'rbbt/util/open'
4
+ require 'rbbt/bow/bow'
5
+ require 'rbbt/bow/dictionary'
6
+
7
+ require 'soap/wsdlDriver'
8
+ require 'stemmer'
9
+ require 'open4'
10
+ require 'progress-monitor'
11
+ require 'yaml'
12
+
13
+ # Produce Stem lists, used by the Web Service
14
+ class String
15
+
16
+ alias old_stem stem
17
+
18
+ def self.reset_stem_list
19
+ @@stem_dictionary = Hash.new
20
+ end
21
+
22
+ # Extends the stem functionality so that is generates a dictionary of
23
+ # stems. For each stem a list of words that reduce to it.
24
+ def stem
25
+ res = old_stem
26
+ @@stem_dictionary[res] ||= Hash.new
27
+ @@stem_dictionary[res][self] ||= 1
28
+ res
29
+ end
30
+
31
+ # Returns the dictionary of recorded stems.
32
+ def self.stem_list(dictionary = nil)
33
+ stem_list = Hash.new
34
+ @@stem_dictionary.each{|k,l|
35
+ next if dictionary && !dictionary.include?(k)
36
+ stem_list[k] = l.keys
37
+ }
38
+ stem_list
39
+ end
40
+
41
+ reset_stem_list
42
+ end
43
+
44
+
45
+ module Sent
46
+
47
+ class NoGenesError < StandardError; end
48
+ class ProcessAbortedError < StandardError; end
49
+ class WSError < StandardError; end
50
+
51
+ def self.run_R(command)
52
+ pid, stdin, stdout, stderr = Open4::popen4 "R --vanilla --slave"
53
+ stdin.write "source('#{File.join(Sent.rdir,'matrix.R')}');\n"
54
+ stdin.write "#{ command };\n"
55
+ stdin.close
56
+
57
+ Process.wait pid
58
+ raise ProcessAborted, "Error in R process" if $?.exitstatus != 0
59
+ result = stdout.read + stderr.read
60
+ stdout.close
61
+ stderr.close
62
+
63
+ puts result if result != ""
64
+ result
65
+ end
66
+
67
+ def self.metadocs(assocfile, output, low=0.001, hi=0.65, max=3000)
68
+
69
+ associations = Open.to_hash(assocfile, :flatten => true, :sep => "\t|,")
70
+
71
+ dict = Dictionary::TF_IDF.new
72
+
73
+ String.reset_stem_list
74
+
75
+ Progress.monitor("Building Dictionary for #{File.basename(output)}", 1000)
76
+ associations.each{|gene, pmids|
77
+ text = PubMed.get_article(pmids).collect{|p| p[1].text}.join("\n")
78
+ dict.add(BagOfWords.count(text.bigrams))
79
+ }
80
+
81
+ # At least 3 genes must have a word to be chosen
82
+ hard_min = 3 * 100 / associations.keys.length
83
+ hi = hard_min if hi < hard_min
84
+
85
+ d = dict.weights(:low => low, :hi => hi, :limit => max)
86
+ Open.write(output + '.dict', d.sort.collect{|p| p.join("\t")}.join("\n"))
87
+ terms = d.keys.sort
88
+
89
+ fout = File.open(output, 'w')
90
+ fout.puts("\t" + terms.join("\t"))
91
+
92
+ Progress.monitor("Building Metadoc for #{File.basename(output)}", 1000)
93
+ associations.each{|gene, pmids|
94
+ text = PubMed.get_article(pmids).collect{|p| p[1].text}.join("\n")
95
+ fout.puts(([gene] + BagOfWords.features(text, terms)).join("\t"))
96
+ }
97
+ fout.close
98
+
99
+ Open.write(output + '.stems', String.stem_list(terms.collect{|p| p.split(/ /)}.flatten.uniq).collect{|k,v| "#{ k }\t#{v.join("\t")}"}.join("\n"))
100
+ end
101
+
102
+ def self.matrix(metadocs, output, list=nil)
103
+ list ||= []
104
+
105
+ if list.empty?
106
+ FileUtils.cp metadocs, output
107
+ else
108
+ `head -n 1 #{ metadocs } > #{ output }`
109
+ `grep '^\\(#{list.join('\\|')}\\)[[:space:]]' #{ metadocs } >> #{output}`
110
+ raise Sent::NoGenesError, "No Genes Matched" if $? != 0
111
+ end
112
+
113
+ dict = metadocs + '.dict'
114
+ run_R("SENT.prepare.matrix('#{ output }', '#{ output }', '#{metadocs + '.dict'}')")
115
+ end
116
+
117
+
118
+ @@bionmf_wsdl = "http://bionmf.dacya.ucm.es/WebService/BioNMFWS.wsdl"
119
+ def self.NMF(matrix, out, k, executions = 10)
120
+ driver = SOAP::WSDLDriverFactory.new( @@bionmf_wsdl).create_rpc_driver
121
+
122
+ # Upload matrix
123
+ nmf_matrix = driver.upload_matrix(
124
+ File.open(matrix).read, # matrix
125
+ false, # binary
126
+ true, # column labels
127
+ true, # row labels
128
+ true, # transpose
129
+ "No", # positive
130
+ "No", # normalization
131
+ "matrix") # Suggested name
132
+ # Send several executions in parallel
133
+ while !driver.done(nmf_matrix)
134
+ sleep(5)
135
+ end
136
+
137
+ if driver.error(nmf_matrix)
138
+ error = driver.messages(nmf_matrix).join("\n")
139
+ raise "Error pre-processing matrix!" + driver.messages(nmf_matrix).join("\n")
140
+ end
141
+
142
+ threads = []
143
+ error = nil
144
+ executions.times{|i|
145
+ threads << Thread.new(i){ |num|
146
+ times = 3
147
+ begin
148
+
149
+ job_id = driver.standardNMF(
150
+ nmf_matrix, # Matrix job
151
+ "Standard", # Algorithm
152
+ k, # Factor Start
153
+ k, # Factor End
154
+ 1, # Runs
155
+ 2000, # Iterations
156
+ 40, # Stop criteria
157
+ 0, # Not used (nsnmf smoothness)
158
+ false, # extra info
159
+ '') # Suggested name
160
+
161
+ while !driver.done(job_id)
162
+ sleep(5)
163
+ end
164
+
165
+ if driver.error(job_id)
166
+ error = driver.messages(job_id).join("\n")
167
+ raise "Error in NMF" + driver.messages(job_id).join("\n")
168
+ end
169
+
170
+ results = driver.results(job_id)
171
+ fw = File.open(out + ".matrix_w.#{num}",'w')
172
+ fw.write(driver.result(results[0]).sub(/\t(.*)\t$/,'\1'))
173
+ fw.close
174
+ fh = File.open(out + ".matrix_h.#{num}",'w')
175
+ fh.write(driver.result(results[1]).sub(/\t(.*)\t$/,'\1'))
176
+ fh.close
177
+ driver.clean(job_id)
178
+ rescue Sent::ProcessAbortedError
179
+ puts "Process aborted for #{ num }"
180
+ driver.abort(job_id)
181
+ rescue Timeout::Error
182
+ if times > 0
183
+ times -= 1
184
+ sleep 2
185
+ retry
186
+ else
187
+ raise Sent::ProcessAbortedError, "NMF Execution #{ num } timed out"
188
+ end
189
+ rescue Exception
190
+ puts $!.message
191
+ if times > 0
192
+ times -= 1
193
+ puts "Retrying thread #{ num }"
194
+ retry
195
+ else
196
+
197
+ puts "NMF Execution #{ num } Produced Exception"
198
+ puts $!.class
199
+ puts $!.message
200
+ puts $!.backtrace
201
+ raise Sent::ProcessAbortedError, "NMF Execution #{ num } Produced Exception"
202
+ end
203
+ ensure
204
+ Thread.exit
205
+ end
206
+ }
207
+ sleep 1
208
+
209
+ }
210
+
211
+ # Allow threads to be aborted
212
+ aborted = false
213
+ old_int = Signal.trap("INT") do
214
+ STDERR.puts "Killing threads"
215
+ threads.each{|t| t.raise Sent::ProcessAbortedError, "Process Aborted"}
216
+ aborted = true
217
+ end
218
+
219
+ threads.each { |aThread| aThread.join }
220
+
221
+ Signal.trap("INT", old_int)
222
+ driver.clean(nmf_matrix)
223
+
224
+ if aborted
225
+ raise Sent::ProcessAbortedError, "Process Aborted"
226
+ end
227
+
228
+ if error
229
+ raise Exception, "Error in NMF:\n" + error
230
+ end
231
+
232
+ run_R("SENT.join.results('#{ out }')")
233
+
234
+ FileUtils.rm Dir.glob(out + '.matrix_*.*')
235
+ end
236
+ # Runs the BioNMF sample_classification job for k in kstart..kend and returns the last result line split on tabs.
237
+ def self.CCC(matrix, kstart, kend)
238
+   raise "Error in range: #{ kstart } to #{ kend }" if kstart.to_i >= kend.to_i
239
+   # The bounds are compared as integers because callers may pass strings (e.g. values read from ENV).
240
+   driver = SOAP::WSDLDriverFactory.new( @@bionmf_wsdl).create_rpc_driver
241
+
242
+   # Prepare matrix for processing (File.read closes the file once it has been read)
243
+   nmf_matrix = driver.upload_matrix(File.read(matrix))
244
+   driver.preprocess(nmf_matrix,1,"No","No", true, true)
245
+
246
+   job_id = driver.sample_classification(nmf_matrix,kstart.to_i,kend.to_i,10)
247
+
248
+   aborted = false
249
+   old_int = Signal.trap("INT") do
250
+     puts "Aborting bestK process"
251
+     driver.abort(job_id)
252
+     aborted = true
253
+   end
254
+
255
+   while (status = driver.status(job_id)) == 0
256
+     sleep(5)
257
+   end
258
+
259
+   driver.clean_matrix(nmf_matrix)
260
+   Signal.trap("INT", old_int)
261
+
262
+   if aborted
263
+     raise Sent::ProcessAbortedError, "Process Aborted"
264
+   end
265
+
266
+   if status == -1
267
+     raise Sent::WSError, "Error processing matrix:\n" + driver.info(job_id)
268
+   end
269
+
270
+   results = driver.results(job_id)
271
+   text = driver.get_result(results[0])
272
+   text.split(/\n/s).last.split(/\t/)
273
+ end
274
+
275
+ def self.analyze(prefix, output, clusters = nil, num_words = 15)
276
+
277
+ FileUtils.rm Dir.glob(output + '*.words') + Dir.glob(output + '*.genes')
278
+ run_R("SENT.analyze('#{ prefix }', '#{ output }', '#{clusters}', '#{num_words}')")
279
+ words = Dir.glob(output + '*.words').sort.collect{|f| Open.read(f).split(/\n/)}
280
+ genes = Dir.glob(output + '*.genes').sort.collect{|f| Open.read(f).split(/\n/)}
281
+
282
+ groups = []
283
+ words.zip(genes).each{|p|
284
+ groups << {:words => p[0], :genes => p[1]}
285
+ }
286
+
287
+ Open.write(output + '.summary', groups.to_yaml)
288
+ end
289
+
290
+ def self.literature_index(pmids, outfile)
291
+
292
+ index = Ferret::Index::Index.new(:path => outfile)
293
+
294
+ index.field_infos.add_field(:title, :index => :yes, :boost => 0.67)
295
+ index.field_infos.add_field(:abstract, :index => :yes, :boost => 0.33)
296
+
297
+ Progress.monitor("Building index for #{pmids.length} articles")
298
+ pmids.each{|pmid|
299
+ begin
300
+ article = PubMed.get_article(pmid)
301
+ abstract = article.abstract
302
+ title = article.title
303
+
304
+ abstract_content = BagOfWords.terms(abstract).collect{|w,n| (1..n).collect{ w }}.flatten.join(" ")
305
+ title_content = BagOfWords.terms(title).collect{|w,n| (1..n).collect{ w }}.flatten.join(" ")
306
+
307
+ index << {:id => pmid, :abstract => abstract_content, :name => title_content}
308
+ rescue Exception
309
+ puts $!.backtrace
310
+ puts $!.message
311
+ end
312
+
313
+ }
314
+ index.close
315
+ end
316
+
317
+ def self.search_index(words, index)
318
+ index = Ferret::Index::Index.new(:path => index)
319
+
320
+ ranks = []
321
+ index.search_each("#{ words.collect{|w| w.stem}.join(" ") }", :limit => 8000) do |id,score|
322
+ next unless score > 0.0001
323
+ ranks << [index[id][:id],score]
324
+ end
325
+
326
+ ranks
327
+ end
328
+ end
data/lib/sent.rb ADDED
@@ -0,0 +1,71 @@
1
+ $:.unshift(File.dirname(__FILE__)) unless
2
+ $:.include?(File.dirname(__FILE__)) || $:.include?(File.expand_path(File.dirname(__FILE__)))
3
+
4
+ require 'yaml'
5
+ require 'fileutils'
6
+ require 'stemmer'
7
+ require 'ferret'
8
+
9
+ module Sent
10
+
11
+ class NoConfigError < StandardError; end
12
+
13
+ @@rootdir = File.dirname(File.dirname(__FILE__))
14
+ @@datadir = @@workdir = @@tmpdir = nil
15
+
16
+ def self.load_config
17
+ if File.exist?File.join(@@rootdir, 'sent.config')
18
+ config = YAML.load_file(File.join(@@rotdir, 'sent.config'))
19
+ if config.is_a? Hash
20
+ @@datadir = config['datadir'] if config['datadir']
21
+ @@workdir = config['workdir'] if config['workdir']
22
+ @@tmpdir = config['tmpdir'] if config['tmpdir']
23
+ end
24
+ end
25
+
26
+
27
+
28
+ if File.exist?(File.join(ENV['HOME'], '.sent'))
29
+ config = YAML.load_file(File.join(ENV['HOME'], '.sent') )
30
+ if config.is_a? Hash
31
+ @@datadir = config['datadir'] if config['datadir']
32
+ @@workdir = config['workdir'] if config['workdir']
33
+ @@tmpdir = config['tmpdir'] if config['tmpdir']
34
+ end
35
+ end
36
+
37
+ if @@datadir.nil? || @@workdir.nil? || @@tmpdir.nil?
38
+ raise Sent::NoConfig, "sent not configured. Edit #{File.join(@@rootdir, 'sent.config')} or $HOME/.sent"
39
+ end
40
+
41
+
42
+ FileUtils.mkdir_p @@datadir unless File.exist? @@datadir
43
+ FileUtils.mkdir_p @@workdir unless File.exist? @@workdir
44
+ FileUtils.mkdir_p @@tmpdir unless File.exist? @@tmpdir
45
+
46
+ end
47
+
48
+ def self.datadir
49
+ @@datadir
50
+ end
51
+ def self.workdir
52
+ @@workdir
53
+ end
54
+ def self.tmpdir
55
+ @@tmpdir
56
+ end
57
+
58
+ def self.rootdir
59
+ @@rootdir
60
+ end
61
+
62
+
63
+ def self.rdir
64
+ File.join(@@rootdir, 'R')
65
+ end
66
+
67
+ self.load_config
68
+ end
69
+
70
+
71
+
@@ -0,0 +1,17 @@
1
+ require 'sent'
2
+
3
+ $datadir = Sent.datadir
4
+ $scriptdir = File.join(Sent.rootdir, '/install_scripts')
5
+
6
+ task 'analysis' do
7
+ directory = "#{$datadir}/analysis"
8
+ FileUtils.mkdir_p directory
9
+ %w(Rakefile).each{|f|
10
+ FileUtils.cp_r File.join($scriptdir, "analysis/#{ f }"), directory
11
+ }
12
+
13
+ %w(associations metadocs matrices NMF summary).each{|d|
14
+ FileUtils.mkdir_p File.join(directory, d)
15
+ }
16
+ end
17
+
data/test/helper.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+
4
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
5
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
6
+ require 'sent'
7
+
8
+ class Test::Unit::TestCase
9
+ end
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbbt-sent
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Miguel Vazquez
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-11-09 00:00:00 +01:00
13
+ default_executable: sent_config
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rake
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: "0"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: simpleconsole
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: "0"
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: rbbt
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: ferret
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: "0"
54
+ version:
55
+ - !ruby/object:Gem::Dependency
56
+ name: stemmer
57
+ type: :runtime
58
+ version_requirement:
59
+ version_requirements: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ version:
65
+ - !ruby/object:Gem::Dependency
66
+ name: progress-monitor
67
+ type: :runtime
68
+ version_requirement:
69
+ version_requirements: !ruby/object:Gem::Requirement
70
+ requirements:
71
+ - - ">="
72
+ - !ruby/object:Gem::Version
73
+ version: "0"
74
+ version:
75
+ - !ruby/object:Gem::Dependency
76
+ name: open4
77
+ type: :runtime
78
+ version_requirement:
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: "0"
84
+ version:
85
+ description: Use literature mining to find semantic features to describe clusters of genes
86
+ email: miguel.vazquez@fdi.ucm.es
87
+ executables:
88
+ - sent_config
89
+ extensions: []
90
+
91
+ extra_rdoc_files:
92
+ - LICENSE
93
+ - README.rdoc
94
+ files:
95
+ - R/matrix.R
96
+ - bin/sent_config
97
+ - install_scripts/analysis/Rakefile
98
+ - lib/sent.rb
99
+ - lib/sent/main.rb
100
+ - tasks/install.rake
101
+ - test/helper.rb
102
+ - LICENSE
103
+ - README.rdoc
104
+ has_rdoc: true
105
+ homepage: http://github.com/mikisvaz/rbbt-sent
106
+ licenses: []
107
+
108
+ post_install_message:
109
+ rdoc_options:
110
+ - --charset=UTF-8
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: "0"
118
+ version:
119
+ required_rubygems_version: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - ">="
122
+ - !ruby/object:Gem::Version
123
+ version: "0"
124
+ version:
125
+ requirements: []
126
+
127
+ rubyforge_project:
128
+ rubygems_version: 1.3.5
129
+ signing_key:
130
+ specification_version: 3
131
+ summary: Semantic Features in Text
132
+ test_files:
133
+ - test/helper.rb