rbbt 1.1.7 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +72 -136
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -246
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -145
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -79
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/cgd.Rakefile +0 -84
- data/install_scripts/organisms/human.Rakefile +0 -145
- data/install_scripts/organisms/mgi.Rakefile +0 -77
- data/install_scripts/organisms/pombe.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -258
- data/install_scripts/organisms/rgd.Rakefile +0 -88
- data/install_scripts/organisms/sgd.Rakefile +0 -66
- data/install_scripts/organisms/tair.Rakefile +0 -54
- data/install_scripts/organisms/worm.Rakefile +0 -109
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -86
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -40
- data/lib/rbbt/sources/organism.rb +0 -245
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -111
- data/lib/rbbt/util/arrayHash.rb +0 -255
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -235
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -19
- data/tasks/install.rake +0 -124
data/lib/rbbt/util/filecache.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
require 'fileutils'
|
2
|
-
require 'rbbt'
|
3
|
-
|
4
|
-
# Provides caching functionality for files downloaded from the internet
|
5
|
-
module FileCache
|
6
|
-
|
7
|
-
class BadPathError < StandardError; end
|
8
|
-
class FileExistsError < StandardError; end
|
9
|
-
|
10
|
-
private
|
11
|
-
|
12
|
-
# Remove slash characters from filename.
|
13
|
-
def self.clean_path(filename)
|
14
|
-
filename.gsub(/\//,'_SLASH_')
|
15
|
-
end
|
16
|
-
|
17
|
-
# Check that the file name is safe and is in the correct format
|
18
|
-
def self.sanity_check(filename)
|
19
|
-
if filename =~ /\//
|
20
|
-
raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
|
21
|
-
end
|
22
|
-
if filename !~ /.+\..+/
|
23
|
-
raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
public
|
28
|
-
|
29
|
-
# Find the path that a particular file would have in the cache
|
30
|
-
def self.path(filename)
|
31
|
-
sanity_check(filename)
|
32
|
-
|
33
|
-
name, extension = filename.match(/(.+)\.(.+)/).values_at(1,2)
|
34
|
-
dirs = name.scan(/./).reverse.values_at(0,1,2,3,4).reverse.compact.join('/')
|
35
|
-
|
36
|
-
return File.join(File.join(Rbbt.cachedir,dirs),filename)
|
37
|
-
end
|
38
|
-
|
39
|
-
# Add a file in the cache. Raise exception if exists, unless force is
|
40
|
-
# used.
|
41
|
-
def self.add_file(filename, content, options = {})
|
42
|
-
sanity_check(filename)
|
43
|
-
|
44
|
-
path = path(filename)
|
45
|
-
FileUtils.makedirs(File.dirname(path), :mode => 0777)
|
46
|
-
|
47
|
-
if File.exist?(path) and ! (options[:force] || options['force'])
|
48
|
-
raise FileCache::FileExistsError, "File #{filename} already in cache"
|
49
|
-
end
|
50
|
-
|
51
|
-
File.open(path,'w'){|f|
|
52
|
-
f.write(content)
|
53
|
-
}
|
54
|
-
FileUtils.chmod 0666, path
|
55
|
-
|
56
|
-
nil
|
57
|
-
end
|
58
|
-
|
59
|
-
# Removes the file from cache
|
60
|
-
def self.del_file(filename)
|
61
|
-
sanity_check(filename)
|
62
|
-
|
63
|
-
path = path(filename)
|
64
|
-
|
65
|
-
if File.exist? path
|
66
|
-
FileUtils.rm path
|
67
|
-
end
|
68
|
-
|
69
|
-
nil
|
70
|
-
end
|
71
|
-
|
72
|
-
end
|
data/lib/rbbt/util/index.rb
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
require 'rbbt/util/open'
|
2
|
-
require 'rbbt/util/arrayHash'
|
3
|
-
|
4
|
-
module Index
|
5
|
-
|
6
|
-
# Creates an inverse index. Takes a file with rows of elements
|
7
|
-
# separated by a given pattern (specified by +sep+) and returns a hash
|
8
|
-
# where each element points to the first element in the row. +lexicon+
|
9
|
-
# is the file containing the data.
|
10
|
-
def self.index(lexicon, options = {})
|
11
|
-
options = {:sep => "\t|\\|", :case_sensitive => true}.merge(options)
|
12
|
-
|
13
|
-
|
14
|
-
data = Open.to_hash(lexicon, options)
|
15
|
-
if options[:clean]
|
16
|
-
data = ArrayHash.clean(data)
|
17
|
-
end
|
18
|
-
|
19
|
-
index = {}
|
20
|
-
|
21
|
-
data.each{|code, id_lists|
|
22
|
-
next if code.nil? || code == ""
|
23
|
-
id_lists.flatten.compact.uniq.each{|id|
|
24
|
-
id = id.downcase unless options[:case_sensitive]
|
25
|
-
index[id] = code
|
26
|
-
}
|
27
|
-
}
|
28
|
-
data.each{|code, id_lists|
|
29
|
-
next if code.nil? || code == ""
|
30
|
-
id = code
|
31
|
-
id = id.downcase unless options[:case_sensitive]
|
32
|
-
index[id] = code
|
33
|
-
}
|
34
|
-
|
35
|
-
if !options[:case_sensitive]
|
36
|
-
class << index; self; end.instance_eval{
|
37
|
-
alias_method :old_get, :[]
|
38
|
-
define_method(:[], proc{|key| old_get(key.to_s.downcase)})
|
39
|
-
|
40
|
-
alias_method :old_values_at, :values_at
|
41
|
-
define_method(:values_at, proc{|*keys| old_values_at(*keys.collect{|key| key.to_s.downcase }) })
|
42
|
-
}
|
43
|
-
end
|
44
|
-
|
45
|
-
index
|
46
|
-
end
|
47
|
-
end
|
data/lib/rbbt/util/misc.rb
DELETED
@@ -1,106 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
|
4
|
-
class String
|
5
|
-
CONSONANTS = []
|
6
|
-
if File.exists? File.join(Rbbt.datadir, 'wordlists/consonants')
|
7
|
-
Object::Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line{|l| CONSONANTS << l.chomp}
|
8
|
-
end
|
9
|
-
|
10
|
-
# Uses heuristics to checks if a string seems like a special word, like a gene name.
|
11
|
-
def is_special?
|
12
|
-
# Only consonants
|
13
|
-
return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i
|
14
|
-
|
15
|
-
# Not a word
|
16
|
-
return false if self =~ /[^\s]\s[^\s]/;
|
17
|
-
return false if self.length < 3;
|
18
|
-
# Alphanumeric
|
19
|
-
return true if self =~ /[0-9]/ && self =~ /[a-z]/i
|
20
|
-
# All Caps
|
21
|
-
return true if self =~ /[A-Z]{2,}/;
|
22
|
-
# Caps Mix
|
23
|
-
return true if self =~ /[a-z][A-Z]/;
|
24
|
-
# All consonants
|
25
|
-
return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
|
26
|
-
# Dashed word
|
27
|
-
return true if self =~ /(^\w-|-\w$)/
|
28
|
-
# To many consonants (very heuristic)
|
29
|
-
if self =~ /([^aeiouy]{3,})/i && !CONSONANTS.include?($1.downcase)
|
30
|
-
return true
|
31
|
-
end
|
32
|
-
|
33
|
-
return false
|
34
|
-
end
|
35
|
-
|
36
|
-
# Turns the first letter to lowercase
|
37
|
-
def downcase_first
|
38
|
-
return "" if self == ""
|
39
|
-
letters = self.scan(/./)
|
40
|
-
letters[0].downcase!
|
41
|
-
letters.join("")
|
42
|
-
end
|
43
|
-
|
44
|
-
# Turns a roman number into arabic form is possible. Just simple
|
45
|
-
# romans only...
|
46
|
-
def arabic
|
47
|
-
return 1 if self =~ /^I$/;
|
48
|
-
return 2 if self =~ /^II$/;
|
49
|
-
return 3 if self =~ /^III$/;
|
50
|
-
return 4 if self =~ /^IV$/;
|
51
|
-
return 5 if self =~ /^V$/;
|
52
|
-
return 10 if self =~ /^X$/;
|
53
|
-
|
54
|
-
return nil
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
$greek = {
|
62
|
-
"alpha" => "a",
|
63
|
-
"beta" => "b",
|
64
|
-
"gamma" => "g",
|
65
|
-
"delta" => "d",
|
66
|
-
"epsilon" => "e",
|
67
|
-
"zeta" => "z",
|
68
|
-
"eta" => "e",
|
69
|
-
"theta" => "th",
|
70
|
-
"iota" => "i",
|
71
|
-
"kappa" => "k",
|
72
|
-
"lambda" => "l",
|
73
|
-
"mu" => "m",
|
74
|
-
"nu" => "n",
|
75
|
-
"xi" => "x",
|
76
|
-
"omicron" => "o",
|
77
|
-
"pi" => "p",
|
78
|
-
"rho" => "r",
|
79
|
-
"sigma" => "s",
|
80
|
-
"tau" => "t",
|
81
|
-
"upsilon" => "u",
|
82
|
-
"phi" => "ph",
|
83
|
-
"chi" => "ch",
|
84
|
-
"psi" => "ps",
|
85
|
-
"omega" => "o"
|
86
|
-
}
|
87
|
-
|
88
|
-
$inverse_greek = Hash.new
|
89
|
-
$greek.each{|l,s| $inverse_greek[s] = l }
|
90
|
-
|
91
|
-
$stopwords = Open.read(File.join(Rbbt.datadir, 'wordlists/stopwords')).scan(/\w+/) if File.exists? File.join(Rbbt.datadir, 'wordlists/stopwords')
|
92
|
-
|
93
|
-
class Array
|
94
|
-
|
95
|
-
# Divides the array into +num+ chunks of the same size by placing one
|
96
|
-
# element in each chunk iteratively.
|
97
|
-
def chunk(num)
|
98
|
-
chunks = []
|
99
|
-
each_with_index{|e, i|
|
100
|
-
c = i % num
|
101
|
-
chunks[c] ||=[]
|
102
|
-
chunks[c] << e
|
103
|
-
}
|
104
|
-
chunks
|
105
|
-
end
|
106
|
-
end
|
data/lib/rbbt/util/open.rb
DELETED
@@ -1,235 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/tmpfile'
|
3
|
-
|
4
|
-
|
5
|
-
# Provides with a few helper functions to read and write files, as well # as
|
6
|
-
# for accessing remote files. It supports caching the files.
|
7
|
-
module Open
|
8
|
-
|
9
|
-
def self.fields(line, sep = "\t")
|
10
|
-
chunks = line.chomp.split(/(#{sep})/).select{|c| c !~ /^#{sep}$/ }
|
11
|
-
if line =~ /#{sep}$/
|
12
|
-
chunks << ""
|
13
|
-
end
|
14
|
-
chunks
|
15
|
-
end
|
16
|
-
|
17
|
-
class DirectoryNotFoundError < StandardError; end
|
18
|
-
class OpenURLError < StandardError; end
|
19
|
-
|
20
|
-
private
|
21
|
-
|
22
|
-
@@remote_cachedir = File.join(Rbbt.cachedir, 'open-remote/')
|
23
|
-
FileUtils.mkdir @@remote_cachedir unless File.exist? @@remote_cachedir
|
24
|
-
|
25
|
-
# If no data is specified and the url is found in the cache the saved
|
26
|
-
# contents are returned, if not found, the url is opened and the contents of
|
27
|
-
# that are returned. If +data+ is specified then it is saved in the
|
28
|
-
# cache under +url+. To match +url+ in the cache a MD5 digest is used.
|
29
|
-
# The location of the cache directory is bu default
|
30
|
-
# File.join(Rbbt.cachedir, 'open-remote/').
|
31
|
-
def self.cache(url, data = nil)
|
32
|
-
require 'digest/md5'
|
33
|
-
digest = Digest::MD5.hexdigest(url)
|
34
|
-
|
35
|
-
if data
|
36
|
-
Open.write(File.join(@@remote_cachedir, digest), data)
|
37
|
-
return nil
|
38
|
-
else
|
39
|
-
if File.exist? File.join(@@remote_cachedir, digest)
|
40
|
-
return File.open(File.join(@@remote_cachedir, digest)){|file| file.read }
|
41
|
-
else
|
42
|
-
return nil
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
# Checks if +url+ is a remote file.
|
48
|
-
def self.remote(url)
|
49
|
-
url =~ /^(?:http|ssh|https|ftp):\/\//
|
50
|
-
end
|
51
|
-
|
52
|
-
|
53
|
-
# Checks if +url+ is a gzip file.
|
54
|
-
def self.gziped(url)
|
55
|
-
if remote(url)
|
56
|
-
return url =~ /\.gz$/ || url =~ /\.gz\?.*$/
|
57
|
-
else
|
58
|
-
return url =~ /\.gz$/
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
|
63
|
-
@@last_time = Time.now
|
64
|
-
def self.wait(lag = 0)
|
65
|
-
time = Time.now
|
66
|
-
|
67
|
-
if time < @@last_time + lag
|
68
|
-
sleep @@last_time + lag - time
|
69
|
-
end
|
70
|
-
|
71
|
-
@@last_time = Time.now
|
72
|
-
end
|
73
|
-
|
74
|
-
public
|
75
|
-
# Reads the file specified by url. If the url es local it just opens
|
76
|
-
# the file, if it is remote if checks the cache first. In any case, it
|
77
|
-
# unzips gzip files automatically.
|
78
|
-
#
|
79
|
-
# Options:
|
80
|
-
# * :quiet => Do not print the progress of downloads
|
81
|
-
# * :nocache => do not use the cache.
|
82
|
-
# * :nice => secconds to wait between online queries
|
83
|
-
#
|
84
|
-
def self.read(url, options = {})
|
85
|
-
|
86
|
-
case
|
87
|
-
when remote(url)
|
88
|
-
if !options[:nocache] && data = cache(url)
|
89
|
-
return data
|
90
|
-
end
|
91
|
-
|
92
|
-
wait(options[:nice]) if options[:nice]
|
93
|
-
tmp = TmpFile.tmp_file("open-")
|
94
|
-
`wget -O #{tmp} '#{url}' #{options[:quiet] ? '-q' : '' }`
|
95
|
-
|
96
|
-
if $?.success?
|
97
|
-
if gziped(url)
|
98
|
-
`mv #{tmp} #{tmp}.gz; gunzip #{tmp}`
|
99
|
-
end
|
100
|
-
|
101
|
-
cache(url, File.open(tmp){|file| file.read}) unless options[:nocache]
|
102
|
-
|
103
|
-
data = File.open(tmp){|file| file.read}
|
104
|
-
FileUtils.rm tmp
|
105
|
-
return data
|
106
|
-
else
|
107
|
-
raise OpenURLError, "Error reading remote url: #{ url }"
|
108
|
-
end
|
109
|
-
|
110
|
-
when IO === url
|
111
|
-
url.read
|
112
|
-
else
|
113
|
-
return File.open(url){|file| file.read}
|
114
|
-
end
|
115
|
-
|
116
|
-
end
|
117
|
-
|
118
|
-
# Writes the contents on the path specified by filename
|
119
|
-
#
|
120
|
-
# Options:
|
121
|
-
# * :force => Create directories if missing.
|
122
|
-
def self.write(filename, content, options = {})
|
123
|
-
if !File.exist? File.dirname(filename)
|
124
|
-
if options[:force]
|
125
|
-
FileUtils.makedirs(File.dirname(filename))
|
126
|
-
else
|
127
|
-
raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
|
128
|
-
end
|
129
|
-
end
|
130
|
-
|
131
|
-
File.open(filename,'w'){|f|
|
132
|
-
f.write content
|
133
|
-
}
|
134
|
-
|
135
|
-
nil
|
136
|
-
end
|
137
|
-
|
138
|
-
# Writes the contents on the path specified by filename. If the file
|
139
|
-
# is present it appends the contents.
|
140
|
-
#
|
141
|
-
# Options:
|
142
|
-
# * :force => Create directories if missing.
|
143
|
-
def self.append(filename, content, options ={})
|
144
|
-
if !File.exist? File.dirname(filename)
|
145
|
-
if options[:force]
|
146
|
-
FileUtils.makedirs(File.dirname(filename))
|
147
|
-
else
|
148
|
-
raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
|
149
|
-
end
|
150
|
-
end
|
151
|
-
|
152
|
-
f = File.open(filename,'a')
|
153
|
-
f.write content
|
154
|
-
f.close
|
155
|
-
|
156
|
-
nil
|
157
|
-
end
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
# Reads a file with rows with elementes separated by a given pattern
|
162
|
-
# and builds a hash with it. The keys of the hash are the elements in
|
163
|
-
# the :native positions, by default the first (0). The value for each
|
164
|
-
# key is an array with one position for each of the rest possible
|
165
|
-
# positions specified in :extra, by default all but the :native. Since
|
166
|
-
# the native key may be repeated, each of the positions of the values
|
167
|
-
# is in itself an array. There are a number of options to change this
|
168
|
-
# behaviour.
|
169
|
-
#
|
170
|
-
# Options:
|
171
|
-
# * :native => position of the elements that will constitute the keys. By default 0.
|
172
|
-
# * :extra => positions of the rest of elements. By default all but :native. It can be an array of positions or a single position.
|
173
|
-
# * :sep => pattern to use in splitting the lines into elements, by default "\t"
|
174
|
-
# * :flatten => flatten the array of arrays that hold the values for each key into a simple array.
|
175
|
-
# * :single => for each key select only the first of the values, instead of the complete array.
|
176
|
-
# * :fix => A Proc that is called to pre-process the line
|
177
|
-
# * :exclude => A Proc that is called to check if the line must be excluded from the process.
|
178
|
-
def self.to_hash(filename, options = {})
|
179
|
-
native = options[:native] || 0
|
180
|
-
extra = options[:extra]
|
181
|
-
exclude = options[:exclude]
|
182
|
-
fix = options[:fix]
|
183
|
-
sep = options[:sep] || "\t"
|
184
|
-
single = options[:single]
|
185
|
-
single = false if single.nil?
|
186
|
-
flatten = options[:flatten] || single
|
187
|
-
flatten = single if flatten.nil?
|
188
|
-
|
189
|
-
extra = [extra] if extra && ! extra.is_a?( Array)
|
190
|
-
|
191
|
-
data = {}
|
192
|
-
Open.read(filename).each_line{|l|
|
193
|
-
l = fix.call(l) if fix
|
194
|
-
next if exclude and exclude.call(l)
|
195
|
-
|
196
|
-
row_fields = self.fields(l, sep)
|
197
|
-
id = row_fields[native]
|
198
|
-
next if id.nil? || id == ""
|
199
|
-
|
200
|
-
data[id] ||= []
|
201
|
-
if extra
|
202
|
-
fields = extra
|
203
|
-
else
|
204
|
-
fields = (0..(row_fields.length - 1)).to_a - [native]
|
205
|
-
end
|
206
|
-
fields.each_with_index{|pos,i|
|
207
|
-
data[id][i] ||= []
|
208
|
-
data[id][i] << row_fields[pos]
|
209
|
-
}
|
210
|
-
}
|
211
|
-
|
212
|
-
if flatten
|
213
|
-
data.each{|key, values|
|
214
|
-
if values
|
215
|
-
values.flatten!
|
216
|
-
values.collect!{|v|
|
217
|
-
if v != ""
|
218
|
-
v
|
219
|
-
else
|
220
|
-
nil
|
221
|
-
end
|
222
|
-
}
|
223
|
-
values.compact!
|
224
|
-
else
|
225
|
-
nil
|
226
|
-
end
|
227
|
-
}
|
228
|
-
end
|
229
|
-
|
230
|
-
data = Hash[*(data.collect{|key,values| [key, values.first]}).flatten] if single
|
231
|
-
|
232
|
-
data
|
233
|
-
end
|
234
|
-
|
235
|
-
end
|