rbbt 1.1.7 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.rdoc +2 -138
- metadata +72 -136
- data/LICENSE +0 -20
- data/bin/rbbt_config +0 -246
- data/install_scripts/classifier/R/classify.R +0 -36
- data/install_scripts/classifier/Rakefile +0 -145
- data/install_scripts/get_abner.sh +0 -2
- data/install_scripts/get_banner.sh +0 -25
- data/install_scripts/get_biocreative.sh +0 -72
- data/install_scripts/get_crf++.sh +0 -26
- data/install_scripts/get_entrez.sh +0 -4
- data/install_scripts/get_go.sh +0 -4
- data/install_scripts/get_polysearch.sh +0 -8
- data/install_scripts/ner/Rakefile +0 -206
- data/install_scripts/ner/config/default.rb +0 -52
- data/install_scripts/norm/Rakefile +0 -219
- data/install_scripts/norm/config/cue_default.rb +0 -10
- data/install_scripts/norm/config/tokens_default.rb +0 -79
- data/install_scripts/norm/functions.sh +0 -23
- data/install_scripts/organisms/Rakefile +0 -43
- data/install_scripts/organisms/cgd.Rakefile +0 -84
- data/install_scripts/organisms/human.Rakefile +0 -145
- data/install_scripts/organisms/mgi.Rakefile +0 -77
- data/install_scripts/organisms/pombe.Rakefile +0 -40
- data/install_scripts/organisms/rake-include.rb +0 -258
- data/install_scripts/organisms/rgd.Rakefile +0 -88
- data/install_scripts/organisms/sgd.Rakefile +0 -66
- data/install_scripts/organisms/tair.Rakefile +0 -54
- data/install_scripts/organisms/worm.Rakefile +0 -109
- data/install_scripts/wordlists/consonants +0 -897
- data/install_scripts/wordlists/stopwords +0 -1
- data/lib/rbbt.rb +0 -86
- data/lib/rbbt/bow/bow.rb +0 -88
- data/lib/rbbt/bow/classifier.rb +0 -116
- data/lib/rbbt/bow/dictionary.rb +0 -187
- data/lib/rbbt/ner/abner.rb +0 -34
- data/lib/rbbt/ner/banner.rb +0 -73
- data/lib/rbbt/ner/dictionaryNER.rb +0 -98
- data/lib/rbbt/ner/regexpNER.rb +0 -70
- data/lib/rbbt/ner/rner.rb +0 -227
- data/lib/rbbt/ner/rnorm.rb +0 -143
- data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
- data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
- data/lib/rbbt/sources/biocreative.rb +0 -75
- data/lib/rbbt/sources/biomart.rb +0 -105
- data/lib/rbbt/sources/entrez.rb +0 -211
- data/lib/rbbt/sources/go.rb +0 -40
- data/lib/rbbt/sources/organism.rb +0 -245
- data/lib/rbbt/sources/polysearch.rb +0 -117
- data/lib/rbbt/sources/pubmed.rb +0 -111
- data/lib/rbbt/util/arrayHash.rb +0 -255
- data/lib/rbbt/util/filecache.rb +0 -72
- data/lib/rbbt/util/index.rb +0 -47
- data/lib/rbbt/util/misc.rb +0 -106
- data/lib/rbbt/util/open.rb +0 -235
- data/lib/rbbt/util/rake.rb +0 -183
- data/lib/rbbt/util/simpleDSL.rb +0 -87
- data/lib/rbbt/util/tmpfile.rb +0 -19
- data/tasks/install.rake +0 -124
data/lib/rbbt/util/filecache.rb
DELETED
@@ -1,72 +0,0 @@
|
|
1
|
-
require 'fileutils'
|
2
|
-
require 'rbbt'
|
3
|
-
|
4
|
-
# Provides caching functionality for files downloaded from the internet
|
5
|
-
module FileCache

  # Raised when a file name contains a slash or is not of the form name.ext.
  class BadPathError < StandardError; end

  # Raised by add_file when the entry already exists and :force is not set.
  class FileExistsError < StandardError; end

  # NOTE: earlier revisions placed the two helpers below under a bare
  # `private` keyword. That keyword has no effect on `def self.` methods, so
  # they have always been callable from outside; they remain so here.

  # Replace every slash in +filename+ with the literal token _SLASH_.
  def self.clean_path(filename)
    filename.gsub(/\//, '_SLASH_')
  end

  # Check that the file name is safe and is in the correct format.
  #
  # Raises FileCache::BadPathError if +filename+ contains a slash or does not
  # look like name.ext. Returns nil otherwise.
  def self.sanity_check(filename)
    if filename =~ /\//
      raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
    end
    if filename !~ /.+\..+/
      raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
    end
  end

  # Find the path that a particular file would have in the cache.
  #
  # The last (up to) five characters of the name part, i.e. everything before
  # the last dot, become nested directories under Rbbt.cachedir, so that
  # "abcdefg.txt" lives in <cachedir>/c/d/e/f/g/abcdefg.txt.
  #
  # Raises FileCache::BadPathError for invalid names.
  def self.path(filename)
    sanity_check(filename)

    name = filename.match(/(.+)\.(.+)/)[1]
    dirs = name.scan(/./).last(5).join('/')

    File.join(Rbbt.cachedir, dirs, filename)
  end

  # Add a file in the cache. Raise exception if exists, unless force is
  # used.
  #
  # +options+ accepts :force (or 'force') to overwrite an existing entry.
  # Directories are created world-accessible (0777) and the file is left
  # world-writable (0666). Returns nil.
  def self.add_file(filename, content, options = {})
    # path validates the name, so no separate sanity_check is needed here.
    target = path(filename)
    FileUtils.makedirs(File.dirname(target), :mode => 0777)

    forced = options[:force] || options['force']
    if File.exist?(target) && !forced
      raise FileCache::FileExistsError, "File #{filename} already in cache"
    end

    File.open(target, 'w') { |f| f.write(content) }
    FileUtils.chmod 0666, target

    nil
  end

  # Removes the file from cache. Missing entries are ignored. Returns nil.
  def self.del_file(filename)
    target = path(filename)
    FileUtils.rm target if File.exist? target

    nil
  end

end
|
data/lib/rbbt/util/index.rb
DELETED
@@ -1,47 +0,0 @@
|
|
1
|
-
require 'rbbt/util/open'
|
2
|
-
require 'rbbt/util/arrayHash'
|
3
|
-
|
4
|
-
module Index

  # Build an inverse index from the file +lexicon+.
  #
  # The file is parsed with Open.to_hash (default separator: tab or pipe).
  # Every element found in a row, and the row key itself, becomes a key of the
  # returned hash pointing to that row key. Row keys are written last, so a
  # row key always maps to itself even if it also appears as an element of
  # another row.
  #
  # Options (merged over the defaults and forwarded to Open.to_hash):
  # * :sep => separator pattern, by default "\t|\\|"
  # * :case_sensitive => when false, keys are stored downcased and the
  #   returned hash downcases keys in #[] and #values_at. By default true.
  # * :clean => pass the parsed data through ArrayHash.clean first.
  def self.index(lexicon, options = {})
    options = {:sep => "\t|\\|", :case_sensitive => true}.merge(options)
    fold_case = !options[:case_sensitive]

    rows = Open.to_hash(lexicon, options)
    rows = ArrayHash.clean(rows) if options[:clean]

    normalize = lambda { |term| fold_case ? term.downcase : term }
    inverse = {}

    # First pass: every element of a row points to the row key.
    rows.each do |code, id_lists|
      next if code.nil? || code == ""
      id_lists.flatten.compact.uniq.each do |id|
        inverse[normalize.call(id)] = code
      end
    end

    # Second pass: row keys point to themselves, overriding the first pass.
    rows.each do |code, _id_lists|
      next if code.nil? || code == ""
      inverse[normalize.call(code)] = code
    end

    if fold_case
      # Patch only this hash instance so lookups are case-insensitive too.
      class << inverse
        alias_method :old_get, :[]
        define_method(:[]) { |key| old_get(key.to_s.downcase) }

        alias_method :old_values_at, :values_at
        define_method(:values_at) { |*keys| old_values_at(*keys.collect { |key| key.to_s.downcase }) }
      end
    end

    inverse
  end
end
|
data/lib/rbbt/util/misc.rb
DELETED
@@ -1,106 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/open'
|
3
|
-
|
4
|
-
class String
|
5
|
-
# Entries read from the 'wordlists/consonants' data file, one per line.
# is_special? uses this list to discount matching consonant clusters.
# The list stays empty when the data file is not installed.
CONSONANTS = []
# File.exist? is used because File.exists? was removed in Ruby 3.2.
if File.exist? File.join(Rbbt.datadir, 'wordlists/consonants')
  Object::Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line{|l| CONSONANTS << l.chomp}
end
|
9
|
-
|
10
|
-
# Uses heuristics to check whether a string looks like a special word, such as a gene name.
|
11
|
-
# Heuristic test for "special" tokens. Returns true or false.
#
# A string made only of consonants is special regardless of length. Otherwise
# strings with inner whitespace or fewer than three characters are rejected,
# and the remaining checks look for digits mixed with letters, runs of
# capitals, lower-to-upper case changes, a one-letter dashed prefix/suffix, or
# a run of three or more non-vowel characters not listed in CONSONANTS.
def is_special?
  # Only consonants
  return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

  # Not a single word, or too short to judge
  return false if self =~ /[^\s]\s[^\s]/ || self.length < 3

  alphanumeric = self =~ /[0-9]/ && self =~ /[a-z]/i
  all_caps     = self =~ /[A-Z]{2,}/
  caps_mix     = self =~ /[a-z][A-Z]/
  # NOTE(review): this pattern only matches a single-letter line, which the
  # length guard above appears to make unreachable - confirm the intent.
  lone_letter  = self =~ /^[a-z]$/i && self !~ /[aeiou]/i
  dashed       = self =~ /(^\w-|-\w$)/
  return true if alphanumeric || all_caps || caps_mix || lone_letter || dashed

  # Too many consonants in a row (very heuristic)
  cluster = self[/([^aeiouy]{3,})/i, 1]
  return false if cluster.nil?

  !CONSONANTS.include?(cluster.downcase)
end
|
35
|
-
|
36
|
-
# Turns the first letter to lowercase
|
37
|
-
# Returns a copy of the string with its first character in lowercase.
#
# The rest of the string is kept untouched, including newlines. The earlier
# scan(/./)-based version silently dropped newlines and raised on a string
# made only of newlines. Returns "" for an empty string.
def downcase_first
  return "" if self == ""
  self[0, 1].downcase + self[1..-1]
end
|
43
|
-
|
44
|
-
# Turns a roman number into arabic form if possible. Just simple
|
45
|
-
# romans only...
|
46
|
-
# Converts the roman numerals I, II, III, IV, V and X to their integer value.
# Returns nil for anything else; matching is case-sensitive.
def arabic
  case self
  when /^I$/   then 1
  when /^II$/  then 2
  when /^III$/ then 3
  when /^IV$/  then 4
  when /^V$/   then 5
  when /^X$/   then 10
  else nil
  end
end
|
56
|
-
end
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
# Maps the spelled-out name of each greek letter to a short latin
# transliteration. Listed as name/value pairs, in alphabet order.
$greek = Hash[*%w[
  alpha   a
  beta    b
  gamma   g
  delta   d
  epsilon e
  zeta    z
  eta     e
  theta   th
  iota    i
  kappa   k
  lambda  l
  mu      m
  nu      n
  xi      x
  omicron o
  pi      p
  rho     r
  sigma   s
  tau     t
  upsilon u
  phi     ph
  chi     ch
  psi     ps
  omega   o
]]

# Reverse lookup, transliteration => letter name. Two transliterations are
# shared ("e" and "o"); the letter listed later wins, so "e" maps to "eta"
# and "o" maps to "omega".
$inverse_greek = $greek.invert
|
90
|
-
|
91
|
-
# Words read from the 'wordlists/stopwords' data file. Left unset (nil) when
# the file is not installed. File.exist? is used because File.exists? was
# removed in Ruby 3.2.
$stopwords = Open.read(File.join(Rbbt.datadir, 'wordlists/stopwords')).scan(/\w+/) if File.exist? File.join(Rbbt.datadir, 'wordlists/stopwords')
|
92
|
-
|
93
|
-
class Array

  # Deals the elements round-robin into +num+ buckets: element i goes to
  # bucket i % num. Returns the array of buckets; when the receiver has fewer
  # than +num+ elements, only as many buckets as elements are created.
  #
  # NOTE(review): this replaces the block-based Enumerable#chunk for arrays -
  # confirm no caller relies on the standard method.
  def chunk(num)
    each_with_index.each_with_object([]) do |(element, position), buckets|
      slot = position % num
      buckets[slot] ||= []
      buckets[slot] << element
    end
  end
end
|
data/lib/rbbt/util/open.rb
DELETED
@@ -1,235 +0,0 @@
|
|
1
|
-
require 'rbbt'
|
2
|
-
require 'rbbt/util/tmpfile'
|
3
|
-
|
4
|
-
|
5
|
-
# Provides a few helper functions to read and write files, as well as
|
6
|
-
# for accessing remote files. It supports caching the files.
|
7
|
-
module Open
|
8
|
-
|
9
|
-
# Splits +line+ into its fields using the pattern +sep+ (regexp source,
# tab by default). Empty fields are kept, including a trailing one when the
# line ends with a separator.
#
# +sep+ is wrapped in a group wherever it is anchored. Without the group, an
# alternation such as "\t|\\|" turned the trailing-separator test into
# "tab anywhere OR pipe at the end", appending a spurious empty field to
# every line that contained a tab. The test also runs on the chomped line so
# that CRLF-terminated lines behave like LF-terminated ones.
def self.fields(line, sep = "\t")
  row = line.chomp
  # Splitting on a capture group keeps the separators so empty fields between
  # consecutive separators survive; the separators are then filtered out.
  chunks = row.split(/(#{sep})/).reject { |c| c =~ /\A(?:#{sep})\z/ }
  chunks << "" if row =~ /(?:#{sep})\z/
  chunks
end
|
16
|
-
|
17
|
-
# Raised by write and append when the target directory does not exist and
# :force was not requested.
class DirectoryNotFoundError < StandardError
end

# Raised by read when a remote url cannot be downloaded.
class OpenURLError < StandardError
end
|
19
|
-
|
20
|
-
private
|
21
|
-
|
22
|
-
# Directory where downloaded remote files are cached, created at load time.
# mkdir_p is used instead of mkdir so that loading does not fail when
# Rbbt.cachedir itself has not been created yet; it is a no-op when the
# directory already exists.
@@remote_cachedir = File.join(Rbbt.cachedir, 'open-remote/')
FileUtils.mkdir_p @@remote_cachedir
|
24
|
-
|
25
|
-
# If no data is specified and the url is found in the cache the saved
|
26
|
-
# contents are returned, if not found, the url is opened and the contents of
|
27
|
-
# that are returned. If +data+ is specified then it is saved in the
|
28
|
-
# cache under +url+. To match +url+ in the cache a MD5 digest is used.
|
29
|
-
# The location of the cache directory is by default
|
30
|
-
# File.join(Rbbt.cachedir, 'open-remote/').
|
31
|
-
# Cache accessor keyed by the MD5 hex digest of +url+.
#
# With +data+: stores it under the digest in the remote cache directory and
# returns nil. Without +data+: returns the cached contents, or nil when the
# url has not been cached.
def self.cache(url, data = nil)
  require 'digest/md5'
  cached_file = File.join(@@remote_cachedir, Digest::MD5.hexdigest(url))

  if data
    Open.write(cached_file, data)
    return nil
  end

  return nil unless File.exist? cached_file

  File.open(cached_file) { |file| file.read }
end
|
46
|
-
|
47
|
-
# Checks if +url+ is a remote file.
|
48
|
-
# Checks if +url+ is a remote file, i.e. a String starting with http://,
# https://, ssh:// or ftp://. Returns a truthy value (0) when it is and nil
# otherwise.
#
# Non-String arguments (read also accepts IO objects) return nil explicitly:
# Object#=~ was removed in Ruby 3.2, so matching them would raise. The scheme
# is anchored at the start of the string (\A) rather than at the start of
# any line.
def self.remote(url)
  return nil unless url.is_a?(String)
  url =~ /\A(?:http|ssh|https|ftp):\/\//
end
|
51
|
-
|
52
|
-
|
53
|
-
# Checks if +url+ is a gzip file.
|
54
|
-
# Tells whether +url+ names a gzip file: it ends in .gz or, for remote urls
# only, has .gz right before a query string. Returns the match position
# (truthy) or nil.
def self.gziped(url)
  is_remote = remote(url)
  plain_match = url =~ /\.gz$/
  return plain_match if plain_match
  return nil unless is_remote

  url =~ /\.gz\?.*$/
end
|
61
|
-
|
62
|
-
|
63
|
-
# Time of the last call to wait (initially the load time of this module).
@@last_time = Time.now

# Sleeps as long as needed so that at least +lag+ seconds separate this call
# from the previous one, then records the current time and returns it.
def self.wait(lag = 0)
  remaining = (@@last_time + lag) - Time.now
  sleep remaining if remaining > 0

  @@last_time = Time.now
end
|
73
|
-
|
74
|
-
public
|
75
|
-
# Reads the file specified by url. If the url is local it just opens
|
76
|
-
# the file; if it is remote it checks the cache first. In any case, it
|
77
|
-
# unzips gzip files automatically.
|
78
|
-
#
|
79
|
-
# Options:
|
80
|
-
# * :quiet => Do not print the progress of downloads
|
81
|
-
# * :nocache => do not use the cache.
|
82
|
-
# * :nice => seconds to wait between online queries
|
83
|
-
#
|
84
|
-
# Returns the contents of +url+, which may be an IO object, a local path or
# a remote url (see remote).
#
# Remote urls are served from the cache when possible; otherwise they are
# downloaded with wget into a temporary file, gunzipped if gziped(url), and
# stored in the cache unless :nocache is set.
#
# Options: :quiet (pass -q to wget), :nocache (bypass the cache), :nice
# (seconds to wait between online queries).
#
# Raises OpenURLError when the download fails.
def self.read(url, options = {})
  # IO objects are handled first: they can never be remote urls, and running
  # the url pattern against them raises on Ruby >= 3.2.
  return url.read if IO === url
  return File.open(url) { |file| file.read } unless remote(url)

  if !options[:nocache] && (cached = cache(url))
    return cached
  end

  wait(options[:nice]) if options[:nice]
  tmp = TmpFile.tmp_file("open-")

  begin
    # Argument-list form of system: the url never passes through a shell, so
    # quotes or metacharacters in it cannot inject commands.
    command = ['wget', '-O', tmp, url]
    command << '-q' if options[:quiet]
    system(*command)
    raise OpenURLError, "Error reading remote url: #{ url }" unless $?.success?

    if gziped(url)
      FileUtils.mv(tmp, "#{tmp}.gz")
      system('gunzip', "#{tmp}.gz")
    end

    data = File.open(tmp) { |file| file.read }
    cache(url, data) unless options[:nocache]
    data
  ensure
    # Remove the temporary files on failure too.
    FileUtils.rm_f(tmp)
    FileUtils.rm_f("#{tmp}.gz")
  end
end
|
117
|
-
|
118
|
-
# Writes the contents on the path specified by filename
|
119
|
-
#
|
120
|
-
# Options:
|
121
|
-
# * :force => Create directories if missing.
|
122
|
-
# Writes +content+ to +filename+, replacing any previous contents.
#
# When the directory of +filename+ is missing it is created if
# options[:force] is set; otherwise Open::DirectoryNotFoundError is raised.
# Returns nil.
def self.write(filename, content, options = {})
  unless File.exist?(File.dirname(filename))
    unless options[:force]
      raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
    end
    FileUtils.makedirs(File.dirname(filename))
  end

  File.open(filename, 'w') { |f| f.write content }

  nil
end
|
137
|
-
|
138
|
-
# Writes the contents on the path specified by filename. If the file
|
139
|
-
# is present it appends the contents.
|
140
|
-
#
|
141
|
-
# Options:
|
142
|
-
# * :force => Create directories if missing.
|
143
|
-
# Appends +content+ to +filename+, creating the file if it does not exist.
#
# When the directory of +filename+ is missing it is created if
# options[:force] is set; otherwise Open::DirectoryNotFoundError is raised.
# Returns nil.
def self.append(filename, content, options ={})
  if !File.exist? File.dirname(filename)
    if options[:force]
      FileUtils.makedirs(File.dirname(filename))
    else
      raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
    end
  end

  # Block form so the handle is closed even if the write raises.
  File.open(filename, 'a') { |f| f.write content }

  nil
end
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
# Reads a file with rows with elements separated by a given pattern
|
162
|
-
# and builds a hash with it. The keys of the hash are the elements in
|
163
|
-
# the :native positions, by default the first (0). The value for each
|
164
|
-
# key is an array with one position for each of the rest possible
|
165
|
-
# positions specified in :extra, by default all but the :native. Since
|
166
|
-
# the native key may be repeated, each of the positions of the values
|
167
|
-
# is in itself an array. There are a number of options to change this
|
168
|
-
# behaviour.
|
169
|
-
#
|
170
|
-
# Options:
|
171
|
-
# * :native => position of the elements that will constitute the keys. By default 0.
|
172
|
-
# * :extra => positions of the rest of elements. By default all but :native. It can be an array of positions or a single position.
|
173
|
-
# * :sep => pattern to use in splitting the lines into elements, by default "\t"
|
174
|
-
# * :flatten => flatten the array of arrays that hold the values for each key into a simple array.
|
175
|
-
# * :single => for each key select only the first of the values, instead of the complete array.
|
176
|
-
# * :fix => A Proc that is called to pre-process the line
|
177
|
-
# * :exclude => A Proc that is called to check if the line must be excluded from the process.
|
178
|
-
# Parses the rows of +filename+ (read through Open.read) into a hash keyed by
# the field at position :native. Rows whose key is missing or empty are
# skipped.
#
# Each value is an array with one slot per extra position; each slot is an
# array holding the values seen across all rows sharing the key. With
# :flatten the slots are merged into one flat array with nil and empty
# values removed; with :single (which implies :flatten) only the first
# remaining value is kept.
#
# Options: :native, :extra, :sep, :flatten, :single, :fix (Proc applied to
# each line first), :exclude (Proc returning true for lines to skip).
def self.to_hash(filename, options = {})
  native  = options[:native] || 0
  extra   = options[:extra]
  exclude = options[:exclude]
  fix     = options[:fix]
  sep     = options[:sep] || "\t"
  single  = options[:single] || false
  flatten = options[:flatten] || single

  extra = [extra] if extra && !extra.is_a?(Array)

  data = {}
  Open.read(filename).each_line do |l|
    l = fix.call(l) if fix
    next if exclude && exclude.call(l)

    row_fields = self.fields(l, sep)
    id = row_fields[native]
    next if id.nil? || id == ""

    data[id] ||= []
    # Without :extra every position but the native one is collected.
    positions = extra || ((0..(row_fields.length - 1)).to_a - [native])
    positions.each_with_index do |pos, i|
      data[id][i] ||= []
      data[id][i] << row_fields[pos]
    end
  end

  if flatten
    data.each_value do |values|
      values.flatten!
      values.reject! { |v| v.nil? || v == "" }
    end
  end

  if single
    # Built entry by entry: the earlier Hash[*pairs.flatten] form splatted the
    # whole table into one argument list.
    firsts = {}
    data.each { |key, values| firsts[key] = values.first }
    data = firsts
  end

  data
end
|
234
|
-
|
235
|
-
end
|