rbbt 1.0.0
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/bin/rbbt_config +180 -0
- data/install_scripts/classifier/R/classify.R +36 -0
- data/install_scripts/classifier/Rakefile +140 -0
- data/install_scripts/get_abner.sh +2 -0
- data/install_scripts/get_banner.sh +25 -0
- data/install_scripts/get_biocreative.sh +72 -0
- data/install_scripts/get_crf++.sh +26 -0
- data/install_scripts/get_entrez.sh +4 -0
- data/install_scripts/get_go.sh +4 -0
- data/install_scripts/get_polysearch.sh +8 -0
- data/install_scripts/ner/Rakefile +206 -0
- data/install_scripts/ner/config/default.rb +52 -0
- data/install_scripts/norm/Rakefile +218 -0
- data/install_scripts/norm/config/cue_default.rb +10 -0
- data/install_scripts/norm/config/tokens_default.rb +79 -0
- data/install_scripts/norm/functions.sh +21 -0
- data/install_scripts/organisms/Rakefile +25 -0
- data/install_scripts/organisms/cgd.Rakefile +84 -0
- data/install_scripts/organisms/human.Rakefile +145 -0
- data/install_scripts/organisms/mgi.Rakefile +77 -0
- data/install_scripts/organisms/pombe.Rakefile +40 -0
- data/install_scripts/organisms/rake-include.rb +258 -0
- data/install_scripts/organisms/rgd.Rakefile +88 -0
- data/install_scripts/organisms/sgd.Rakefile +66 -0
- data/install_scripts/organisms/tair.Rakefile +54 -0
- data/install_scripts/organisms/worm.Rakefile +109 -0
- data/install_scripts/stopwords +1 -0
- data/install_scripts/wordlists/consonants +897 -0
- data/install_scripts/wordlists/stopwords +1 -0
- data/lib/rbbt/bow/bow.rb +87 -0
- data/lib/rbbt/bow/classifier.rb +118 -0
- data/lib/rbbt/bow/dictionary.rb +218 -0
- data/lib/rbbt/ner/abner.rb +34 -0
- data/lib/rbbt/ner/banner.rb +73 -0
- data/lib/rbbt/ner/regexpNER.rb +62 -0
- data/lib/rbbt/ner/rner.rb +227 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
- data/lib/rbbt/ner/rnorm.rb +142 -0
- data/lib/rbbt/sources/biocreative.rb +75 -0
- data/lib/rbbt/sources/biomart.rb +106 -0
- data/lib/rbbt/sources/entrez.rb +211 -0
- data/lib/rbbt/sources/go.rb +40 -0
- data/lib/rbbt/sources/organism.rb +197 -0
- data/lib/rbbt/sources/polysearch.rb +88 -0
- data/lib/rbbt/sources/pubmed.rb +111 -0
- data/lib/rbbt/util/arrayHash.rb +255 -0
- data/lib/rbbt/util/filecache.rb +72 -0
- data/lib/rbbt/util/index.rb +69 -0
- data/lib/rbbt/util/misc.rb +101 -0
- data/lib/rbbt/util/open.rb +207 -0
- data/lib/rbbt/util/simpleDSL.rb +87 -0
- data/lib/rbbt/util/tmpfile.rb +19 -0
- data/lib/rbbt/version.rb +10 -0
- data/lib/rbbt.rb +86 -0
- data/tasks/install.rake +123 -0
- metadata +114 -0
data/lib/rbbt/sources/polysearch.rb
@@ -0,0 +1,88 @@
require 'rbbt'
require 'rbbt/util/open'
require 'rbbt/ner/regexpNER'

# Find terms in the Polysearch thesauri using simple regular expression
# matching. Note that the first time the methods are used the corresponding
# thesauri are loaded into memory. The available thesauri are: disease, drug,
# metabolite, organ, subcellular (subcellular localization) and tissue.
module Polysearch

  @@names = {}
  def self.type_names(type) #:nodoc:
    @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir, 'dbs', 'polysearch', type + '.txt'), :single => true)
  end

  @@indexes = {}
  def self.type_index(type) #:nodoc:
    @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir, 'dbs', 'polysearch', type + '.txt'))
  end

  # Find matches in a string of text. The types array specifies which
  # thesauri to use; if nil, all of them are used.
  def self.match(text, types = nil)
    if types.nil?
      types = %w(disease drug metabolite organ subcellular tissue)
    end

    types = [types] unless Array === types
    types = types.sort

    matches = {}
    types.each{|type|
      matches.merge!(type_index(type).match_hash(text))
    }

    matches
  end

  # Transform a code into a name; type is the thesaurus to use.
  def self.name(type, code)
    type_names(type)[code]
  end

end

if __FILE__ == $0

  text =<<-EOT

    Background Microorganisms adapt their transcriptome by integrating
    multiple chemical and physical signals from their environment. Shake-flask
    cultivation does not allow precise manipulation of individual culture
    parameters and therefore precludes a quantitative analysis of the
    (combinatorial) influence of these parameters on transcriptional
    regulation. Steady-state chemostat cultures, which do enable accurate
    control, measurement and manipulation of individual cultivation parameters
    (e.g. specific growth rate, temperature, identity of the growth-limiting
    nutrient) appear to provide a promising experimental platform for such a
    combinatorial analysis. Results A microarray compendium of 170
    steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
    presented and analyzed. The 170 microarrays encompass 55 unique
    conditions, which can be characterized by the combined settings of 10
    different cultivation parameters. By applying a regression model to assess
    the impact of (combinations of) cultivation parameters on the
    transcriptome, most S. cerevisiae genes were shown to be influenced by
    multiple cultivation parameters, and in many cases by combinatorial
    effects of cultivation parameters. The inclusion of these combinatorial
    effects in the regression model led to higher explained variance of the
    gene expression patterns and resulted in higher function enrichment in
    subsequent analysis. We further demonstrate the usefulness of the
    compendium and regression analysis for interpretation of shake-flask-based
    transcriptome studies and for guiding functional analysis of
    (uncharacterized) genes and pathways. Conclusions Modeling the
    combinatorial effects of environmental parameters on the transcriptome is
    crucial for understanding transcriptional regulation. Chemostat
    cultivation offers a powerful tool for such an approach. Keywords:
    chemostat steady state samples
    Cerebellar
    stroke syndrome

  EOT

  p Polysearch.match(text, 'disease').values.flatten

end
data/lib/rbbt/sources/pubmed.rb
@@ -0,0 +1,111 @@
require 'rbbt/util/filecache'
require 'rbbt/util/open'
require 'rbbt'

# This module offers an interface with PubMed, to perform queries and
# retrieve simple information from articles. It uses the caching
# services of Rbbt.
module PubMed

  private

  @@last = Time.now
  @@pubmed_lag = 1
  def self.get_online(pmids)

    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"

    diff = Time.now - @@last
    sleep @@pubmed_lag - diff unless diff > @@pubmed_lag

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/sm).flatten

    if pmids.is_a? Array
      list = {}
      articles.each{|article|
        pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
        list[pmid] = article
      }
      return list
    else
      return articles.first
    end

  end

  public

  # Processes the XML of an article as served by MedLine and extracts
  # the abstract, title and journal information.
  class Article
    attr_reader :title, :abstract, :journal
    def initialize(xml)
      xml ||= ""
      @abstract = $1 if xml.match(/<AbstractText>(.*)<\/AbstractText>/sm)
      @title    = $1 if xml.match(/<ArticleTitle>(.*)<\/ArticleTitle>/sm)
      @journal  = $1 if xml.match(/<Title>(.*)<\/Title>/sm)
    end

    # Join the text from title and abstract
    def text
      [@title, @abstract].join("\n")
    end
  end

  # Returns the Article object containing the information for the PubMed
  # ID specified as an argument. If +pmid+ is an array instead of a single
  # identifier it returns a hash with the Article object for each id.
  # It uses the Rbbt cache to save the article XML.
  def self.get_article(pmid)

    if pmid.is_a? Array
      missing = []
      list = {}

      pmid.each{|p|
        filename = p.to_s + '.xml'
        if File.exists? FileCache.path(filename)
          list[p] = Article.new(Open.read(FileCache.path(filename)))
        else
          missing << p
        end
      }

      return list unless missing.any?
      articles = get_online(missing)

      articles.each{|p, xml|
        filename = p + '.xml'
        FileCache.add_file(filename, xml, :force => true)
        list[p] = Article.new(xml)
      }

      return list

    else
      filename = pmid.to_s + '.xml'

      if File.exists? FileCache.path(filename)
        return Article.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(pmid)
        FileCache.add_file(filename, xml)

        return Article.new(xml)
      end
    end
  end

  # Performs the specified query and returns an array with the PubMed
  # ids returned. +retmax+ can be used to limit the number of ids
  # returned; if not specified, 30000 is used.
  def self.query(query, retmax = nil)
    retmax ||= 30000

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}", :quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end
end
data/lib/rbbt/util/arrayHash.rb
@@ -0,0 +1,255 @@
class ArrayHash

  # Take two strings of elements separated by the character sep_char and join
  # them into one, removing repetitions.
  def self.merge_values_string(list1, list2, sep_char = '|')
    elem1 = list1.to_s.split(sep_char)
    elem2 = list2.to_s.split(sep_char)
    (elem1 + elem2).select{|e| e.to_s != ""}.uniq.join(sep_char)
  end

  # Merge two lists of elements. Elements could be strings of elements
  # separated by the character sep_char, or arrays of lists of such strings.
  def self.merge_values(list1, list2, sep_char = "|")
    if String === list1 || String === list2
      return merge_values_string(list1, list2)
    end

    if list1.nil?
      list1 = [''] * list2.length
    end

    if list2.nil?
      list2 = [''] * list1.length
    end

    new = []
    list1.each_with_index{|elem, i|
      new << merge_values_string(elem, list2[i], sep_char)
    }
    new
  end

  # Take a hash of arrays and a position, use the value at that position of
  # the arrays to build a new hash with that value as key and the original
  # key prepended to the arrays. The options hash accepts the following keys:
  # :case_insensitive, which defaults to true, and :index, which indicates
  # that the original key should be the value of the hash entry, instead of
  # the complete array of values.
  def self.pullout(hash, pos, options = {})
    index = options[:index]; index = false if index.nil?
    case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?

    new = {}
    hash.each{|key, values|
      code = values[pos].to_s
      next if code == ""

      if index
        list = key
      else
        list = [key] + values
        list.delete_at(pos + 1)
      end

      code.split("|").each{|c|
        c = c.downcase if case_insensitive
        new[c] = merge_values(new[c], list)
      }
    }

    if case_insensitive
      class << new; self; end.instance_eval{
        alias_method :old_get, :[]
        define_method(:[], proc{|key| old_get(key.to_s.downcase)})
      }
    end

    new
  end

  # Merge two hashes of arrays. Each hash contains a number of fields for each
  # entry. The pos1 and pos2 arguments indicate which fields should be used to
  # match entries; the values for pos1 and pos2 can be an integer indicating
  # the position in the array or the symbol :main to refer to the key of the
  # hash. The options hash accepts the key :case_insensitive, which defaults
  # to true.
  def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})

    case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?
    if pos1.to_s.downcase != 'main'
      index1 = pullout(hash1, pos1, options.merge(:index => true))
    elsif options[:case_insensitive]
      new = {}
      hash1.each{|k, v|
        new[k.to_s.downcase] = v
      }
      class << new; self; end.instance_eval{
        alias_method :old_get, :[]
        define_method(:[], proc{|key| old_get(key.to_s.downcase)})
      }
      hash1 = new
    end

    length1 = hash1.values.first.length
    length2 = hash2.values.first.length

    new = {}
    hash2.each{|key, values|
      case
      when pos2.to_s.downcase == 'main'
        k = key
        v = values
      when Fixnum === pos2
        k = values[pos2]
        v = values
        v.delete_at(pos2)
        v.unshift(key)
      else
        raise "Format of second index not understood"
      end

      code = (index1.nil? ? k : index1[k])
      if code
        code.split('|').each{|c|
          c = c.to_s.downcase if options[:case_insensitive]
          new[c] = hash1[c] || [''] * length1
          new[c] += v
        }
      end
    }

    hash1.each{|key, values|
      new[key] ||= values + [''] * length2
    }

    new
  end

  # For a given hash of arrays, filter the position pos of each array with the
  # block of code.
  def self.process(hash, pos, &block)
    new = {}
    hash.each{|key, values|
      v = values
      v[pos] = v[pos].to_s.split("|").collect{|n| block.call(n)}.join("|")
      new[key] = v
    }
    new
  end

  # Clean the structure of repeated values. If the same value appears more
  # than once, keep only the occurrence in the earliest column (columns of the
  # ArrayHash are assumed to be sorted by importance); if two occurrences
  # share the same column, keep the one whose key converts to the smaller
  # integer.
  def self.clean(hash, options = {})
    case_sensitive = options[:case_sensitive]

    found = {}

    hash.each{|k, list|
      list.each_with_index{|values, i|
        (String === values ? values.split("|") : values).each{|v|
          v = v.downcase if case_sensitive
          if found[v].nil?
            found[v] = [k, i]
          else
            last_k, last_i = found[v].values_at(0, 1)
            if last_i > i || (last_i == i && last_k.to_i > k.to_i)
              found[v] = [k, i]
            end
          end
        }
      }
    }

    new_hash = {}
    hash.each{|k, list|
      new_list = []
      list.each_with_index{|values, i|
        new_values = []
        (String === values ? values.split("|") : values).each{|v|
          found_k, found_i = found[(case_sensitive ? v.downcase : v)].values_at(0, 1)
          if found_i == i && found_k == k
            new_values << v
          end
        }
        new_list << (String === values ? new_values.join("|") : values)
      }
      new_hash[k] = new_list
    }
    new_hash
  end

  attr_reader :main, :fields, :data
  def initialize(hash, main, fields = nil)
    @data = hash
    @main = main.to_s

    if fields.nil?
      l = hash.values.first.length
      fields = []
      l.times{|i| fields << "F#{i}"}
    end

    @fields = fields.collect{|f| f.to_s}
  end

  # Wrapper over ArrayHash.process that looks up the field position.
  def process(field, &block)
    pos = self.field_pos(field)
    @data = ArrayHash.process(self.data, pos, &block)
    self
  end

  # Returns the position of a given field in the value arrays
  def field_pos(field)
    return :main if field == :main
    if field.downcase == self.main.downcase
      return :main
    else
      @fields.collect{|f| f.downcase}.index(field.to_s.downcase)
    end
  end

  # Merge two ArrayHashes using the specified field
  def merge(other, field = :main, options = {})
    field = self.main if field == :main

    pos1 = self.field_pos(field)
    pos2 = other.field_pos(field)

    new = ArrayHash.merge(self.data, other.data, pos1, pos2, options)
    @data = new
    if pos2 == :main
      new_fields = other.fields
    else
      new_fields = other.fields
      new_fields.delete_at(pos2)
      new_fields.unshift(other.main)
    end
    @fields += new_fields
    self
  end

  # Remove a field from the ArrayHash
  def remove(field)
    pos = self.field_pos(field)
    return if pos.nil?
    @data = self.data.each{|key, values| values.delete_at(pos)}
    @fields.delete_at(pos)
    self
  end

  def clean
    @data = ArrayHash.clean(@data)
    self
  end
end
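To make the merge semantics concrete, a small sketch follows. The gene tables are invented purely for illustration; the expected results are what the code above should produce under these assumptions.

  require 'rbbt/util/arrayHash'

  # Table 1: keyed by Entrez id, one field with the gene symbol
  data1 = { '672'  => ['BRCA1'],
            '7157' => ['TP53'] }

  # Table 2: keyed by gene symbol, one field with a synonym list
  data2 = { 'BRCA1' => ['RNF53|BRCC1'],
            'TP53'  => ['p53'] }

  t1 = ArrayHash.new(data1, 'Entrez', ['Symbol'])
  t2 = ArrayHash.new(data2, 'Symbol', ['Synonyms'])

  # Match rows of t2 against the Symbol field of t1; since Symbol is the
  # main key of t2, its entries are pulled in by symbol
  t1.merge(t2, 'Symbol')

  p t1.fields         # => ["Symbol", "Synonyms"]
  p t1.data['672']    # => ["BRCA1", "RNF53|BRCC1"]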
data/lib/rbbt/util/filecache.rb
@@ -0,0 +1,72 @@
require 'fileutils'
require 'rbbt'

# Provides caching functionality for files downloaded from the internet
module FileCache

  class BadPathError < StandardError; end
  class FileExistsError < StandardError; end

  private

  # Remove slash characters from filename.
  def self.clean_path(filename)
    filename.gsub(/\//, '_SLASH_')
  end

  # Check that the file name is safe and is in the correct format
  def self.sanity_check(filename)
    if filename =~ /\//
      raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
    end
    if filename !~ /.+\..+/
      raise FileCache::BadPathError, "Filename must have name and extension: name.ext"
    end
  end

  public

  # Find the path that a particular file would have in the cache
  def self.path(filename)
    sanity_check(filename)

    name, extension = filename.match(/(.+)\.(.+)/).values_at(1, 2)
    dirs = name.scan(/./).reverse.values_at(0, 1, 2, 3, 4).reverse.compact.join('/')

    return File.join(File.join(Rbbt.cachedir, dirs), filename)
  end

  # Add a file to the cache. Raises an exception if it already exists,
  # unless :force is used.
  def self.add_file(filename, content, options = {})
    sanity_check(filename)

    path = path(filename)
    FileUtils.makedirs(File.dirname(path), :mode => 0777)

    if File.exist?(path) and ! (options[:force] || options['force'])
      raise FileCache::FileExistsError, "File #{filename} already in cache"
    end

    File.open(path, 'w'){|f|
      f.write(content)
    }
    FileUtils.chmod 0666, path

    nil
  end

  # Remove the file from the cache
  def self.del_file(filename)
    sanity_check(filename)

    path = path(filename)

    if File.exist? path
      FileUtils.rm path
    end

    nil
  end

end
data/lib/rbbt/util/index.rb
@@ -0,0 +1,69 @@
require 'rbbt/util/open'
require 'rbbt/util/arrayHash'

module Index

  # Creates an inverse index. Takes a file with rows of elements
  # separated by a given pattern (specified by +sep+) and returns a hash
  # where each element points to the first element in the row. +lexicon+
  # is the file containing the data.
  def self.index(lexicon, options = {})
    options = {:sep => "\t|\\|", :case_sensitive => true}.merge(options)

    data = Open.to_hash(lexicon, options)
    if options[:clean]
      data = ArrayHash.clean(data)
    end

    index = {}

    data.each{|code, id_lists|
      next if code.nil? || code == ""
      id_lists.flatten.compact.uniq.each{|id|
        id = id.downcase unless options[:case_sensitive]
        index[id] = code
      }
    }
    data.each{|code, id_lists|
      next if code.nil? || code == ""
      id = code
      id = id.downcase unless options[:case_sensitive]
      index[id] = code
    }

    if !options[:case_sensitive]
      class << index; self; end.instance_eval{
        alias_method :old_get, :[]
        define_method(:[], proc{|key| old_get(key.to_s.downcase)})
      }
    end

    index
  end
end

if __FILE__ == $0

  require 'benchmark'

  normal = nil
  puts "Normal " + Benchmark.measure{
    normal = Index.index('/home/miki/rbbt/data/organisms/human/identifiers', :trie => false, :case_sensitive => false)
  }.to_s

  ids = Open.read('/home/miki/git/MARQ/test/GDS1375_malignant_vs_normal_up.genes').collect{|l| l.chomp.strip.upcase}

  new = nil

  puts ids.inspect
  puts "normal " + Benchmark.measure{
    100.times{
      new = ids.collect{|id| normal[id]}
    }
  }.to_s

  puts new.inspect

end
data/lib/rbbt/util/misc.rb
@@ -0,0 +1,101 @@
require 'rbbt'
require 'rbbt/util/open'

$consonants = Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).collect{|l| l.chomp}.uniq

class String
  # Uses heuristics to check if a string seems like a special word, like a gene name.
  def is_special?
    # Only consonants
    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

    # Not a word
    return false if self =~ /[^\s]\s[^\s]/
    return false if self.length < 3
    # Alphanumeric
    return true if self =~ /[0-9]/ && self =~ /[a-z]/i
    # All Caps
    return true if self =~ /[A-Z]{2,}/
    # Caps Mix
    return true if self =~ /[a-z][A-Z]/
    # All consonants
    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
    # Too many consonants (very heuristic)
    if self =~ /([^aeiouy]{3,})/i && !$consonants.include?($1.downcase)
      return true
    end

    return false
  end

  # Turns the first letter to lowercase
  def downcase_first
    return "" if self == ""
    letters = self.scan(/./)
    letters[0].downcase!
    letters.join("")
  end

  # Turns a roman numeral into arabic form if possible. Just simple
  # romans only...
  def arabic
    return 1 if self =~ /^I$/
    return 2 if self =~ /^II$/
    return 3 if self =~ /^III$/
    return 4 if self =~ /^IV$/
    return 5 if self =~ /^V$/
    return 10 if self =~ /^X$/

    return nil
  end
end

$greek = {
  "alpha"   => "a",
  "beta"    => "b",
  "gamma"   => "g",
  "delta"   => "d",
  "epsilon" => "e",
  "zeta"    => "z",
  "eta"     => "e",
  "theta"   => "th",
  "iota"    => "i",
  "kappa"   => "k",
  "lambda"  => "l",
  "mu"      => "m",
  "nu"      => "n",
  "xi"      => "x",
  "omicron" => "o",
  "pi"      => "p",
  "rho"     => "r",
  "sigma"   => "s",
  "tau"     => "t",
  "upsilon" => "u",
  "phi"     => "ph",
  "chi"     => "ch",
  "psi"     => "ps",
  "omega"   => "o"
}

$inverse_greek = Hash.new
$greek.each{|l, s| $inverse_greek[s] = l }

$stopwords = Open.read(File.join(Rbbt.datadir, 'wordlists/stopwords')).scan(/\w+/)

class Array

  # Divides the array into +num+ chunks of the same size by placing one
  # element in each chunk iteratively.
  def chunk(num)
    chunks = []
    each_with_index{|e, i|
      c = i % num
      chunks[c] ||= []
      chunks[c] << e
    }
    chunks
  end
end