rbbt 1.1.7 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +72 -136
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -246
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -145
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -79
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Rakefile +0 -43
  22. data/install_scripts/organisms/cgd.Rakefile +0 -84
  23. data/install_scripts/organisms/human.Rakefile +0 -145
  24. data/install_scripts/organisms/mgi.Rakefile +0 -77
  25. data/install_scripts/organisms/pombe.Rakefile +0 -40
  26. data/install_scripts/organisms/rake-include.rb +0 -258
  27. data/install_scripts/organisms/rgd.Rakefile +0 -88
  28. data/install_scripts/organisms/sgd.Rakefile +0 -66
  29. data/install_scripts/organisms/tair.Rakefile +0 -54
  30. data/install_scripts/organisms/worm.Rakefile +0 -109
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -86
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -40
  49. data/lib/rbbt/sources/organism.rb +0 -245
  50. data/lib/rbbt/sources/polysearch.rb +0 -117
  51. data/lib/rbbt/sources/pubmed.rb +0 -111
  52. data/lib/rbbt/util/arrayHash.rb +0 -255
  53. data/lib/rbbt/util/filecache.rb +0 -72
  54. data/lib/rbbt/util/index.rb +0 -47
  55. data/lib/rbbt/util/misc.rb +0 -106
  56. data/lib/rbbt/util/open.rb +0 -235
  57. data/lib/rbbt/util/rake.rb +0 -183
  58. data/lib/rbbt/util/simpleDSL.rb +0 -87
  59. data/lib/rbbt/util/tmpfile.rb +0 -19
  60. data/tasks/install.rake +0 -124
@@ -1,72 +0,0 @@
1
- require 'fileutils'
2
- require 'rbbt'
3
-
4
# Provides caching functionality for files downloaded from the internet.
#
# Cached files live under Rbbt.cachedir, inside nested directories named
# after the last (up to five) characters of the file's base name.
#
# Note: clean_path and sanity_check used to sit under a bare +private+
# marker. That marker has no effect on methods defined with
# <tt>def self.</tt>, so both methods have always been callable from outside
# the module; they are kept public here so existing callers keep working.
module FileCache

  class BadPathError < StandardError; end
  class FileExistsError < StandardError; end

  # Replaces every slash character in +filename+ with the text '_SLASH_'.
  def self.clean_path(filename)
    filename.gsub(/\//, '_SLASH_')
  end

  # Checks that the file name is safe and has the form name.ext.
  #
  # Raises FileCache::BadPathError when +filename+ contains a slash or when
  # it lacks either a name or an extension.
  def self.sanity_check(filename)
    if filename =~ /\//
      raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
    end
    if filename !~ /.+\..+/
      raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
    end
  end

  # Returns the path that +filename+ has (or would have) in the cache.
  #
  # Raises FileCache::BadPathError when the name fails sanity_check.
  def self.path(filename)
    sanity_check(filename)

    # Everything before the last dot is the base name; its trailing
    # characters (at most five) become one directory level each.
    name = filename.match(/(.+)\.(.+)/)[1]
    dirs = name.chars.last(5).join('/')

    File.join(Rbbt.cachedir, dirs, filename)
  end

  # Adds a file to the cache with the given +content+.
  #
  # Options:
  # * :force (or 'force') => overwrite the file if it is already cached.
  #
  # Raises FileCache::FileExistsError if the file exists and force is not
  # set, and FileCache::BadPathError if the name is not valid. Returns nil.
  def self.add_file(filename, content, options = {})
    # path() validates the file name, so no separate sanity_check is needed.
    target = path(filename)
    FileUtils.makedirs(File.dirname(target), :mode => 0777)

    overwrite = options[:force] || options['force']
    if File.exist?(target) && !overwrite
      raise FileCache::FileExistsError, "File #{filename} already in cache"
    end

    File.open(target, 'w') { |f| f.write(content) }
    FileUtils.chmod 0666, target

    nil
  end

  # Removes the file from the cache if it is there. Returns nil.
  #
  # Raises FileCache::BadPathError if the name is not valid.
  def self.del_file(filename)
    target = path(filename)
    FileUtils.rm target if File.exist?(target)

    nil
  end

end
@@ -1,47 +0,0 @@
1
- require 'rbbt/util/open'
2
- require 'rbbt/util/arrayHash'
3
-
4
module Index

  # Builds an inverse index from the file +lexicon+.
  #
  # The file is read with Open.to_hash, passing +options+ along (merged over
  # the defaults below). Each resulting entry maps a code to lists of
  # identifiers; the returned hash maps every identifier, and every code
  # itself, back to its code. Entries whose code is nil or empty are skipped.
  #
  # Options (besides those understood by Open.to_hash):
  # * :sep => separator pattern, by default "\t|\\|".
  # * :case_sensitive => true by default. When false, keys are stored
  #   downcased and the returned hash downcases the keys given to #[] and
  #   #values_at (the unmodified lookups stay available as old_get and
  #   old_values_at).
  # * :clean => when set, the data is passed through ArrayHash.clean first.
  def self.index(lexicon, options = {})
    options = {:sep => "\t|\\|", :case_sensitive => true}.merge(options)
    fold_case = !options[:case_sensitive]
    normalize = lambda { |term| fold_case ? term.downcase : term }

    table = Open.to_hash(lexicon, options)
    table = ArrayHash.clean(table) if options[:clean]

    inverse = {}

    # First pass: every identifier points to the code of its row.
    table.each do |code, id_lists|
      unless code.nil? || code == ""
        id_lists.flatten.compact.uniq.each do |id|
          inverse[normalize.call(id)] = code
        end
      end
    end

    # Second pass: every code points to itself. It runs after the first
    # pass so that a code wins over an identical identifier of another row.
    table.each do |code, _id_lists|
      unless code.nil? || code == ""
        inverse[normalize.call(code)] = code
      end
    end

    if fold_case
      inverse.singleton_class.class_eval do
        alias_method :old_get, :[]
        define_method(:[]) { |key| old_get(key.to_s.downcase) }

        alias_method :old_values_at, :values_at
        define_method(:values_at) { |*keys| old_values_at(*keys.collect { |key| key.to_s.downcase }) }
      end
    end

    inverse
  end
end
@@ -1,106 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
-
4
class String
  # Consonant clusters that is_special? does not treat as a sign of a special
  # word. They are loaded, one per line, from the 'wordlists/consonants' file
  # in Rbbt.datadir; the list stays empty when that file is missing.
  CONSONANTS = []
  consonants_file = File.join(Rbbt.datadir, 'wordlists/consonants')
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if File.exist?(consonants_file)
    Object::Open.read(consonants_file).each_line { |l| CONSONANTS << l.chomp }
  end

  # Uses heuristics to check if a string seems like a special word, like a
  # gene name. Returns true or false.
  def is_special?
    # Only consonants
    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

    # Not a word: contains whitespace between two non-space characters
    return false if self =~ /[^\s]\s[^\s]/
    return false if self.length < 3
    # Alphanumeric: has both a digit and a letter
    return true if self =~ /[0-9]/ && self =~ /[a-z]/i
    # Caps: two or more consecutive upper-case letters
    return true if self =~ /[A-Z]{2,}/
    # Caps mix: a lower-case letter directly followed by an upper-case one
    return true if self =~ /[a-z][A-Z]/
    # All consonants
    # NOTE(review): this pattern only matches a line made of a single letter,
    # while strings shorter than 3 characters were already rejected above; it
    # looks like /^[a-z]+$/i was intended. Kept unchanged -- confirm.
    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
    # Dashed word: a single word character attached by a dash at either end
    return true if self =~ /(^\w-|-\w$)/
    # Too many consonants (very heuristic): a run of three or more characters
    # that are not vowels (nor 'y') and that is not listed in CONSONANTS
    if self =~ /([^aeiouy]{3,})/i && !CONSONANTS.include?($1.downcase)
      return true
    end

    false
  end

  # Returns a copy of the string with its first character in lowercase.
  #
  # The rest of the string is preserved as is, including newlines (the
  # previous scan(/./) based version dropped them and failed on strings made
  # only of newlines).
  def downcase_first
    return "" if self == ""
    self[0, 1].downcase + self[1..-1]
  end

  # Turns a roman number into arabic form if possible. Only I, II, III, IV,
  # V and X are recognized; returns nil for anything else.
  def arabic
    case self
    when /^I$/   then 1
    when /^II$/  then 2
    when /^III$/ then 3
    when /^IV$/  then 4
    when /^V$/   then 5
    when /^X$/   then 10
    end
  end
end
57
-
58
-
59
-
60
-
61
# Maps the English name of each Greek letter to a short Latin rendering.
$greek = {
  "alpha"   => "a",
  "beta"    => "b",
  "gamma"   => "g",
  "delta"   => "d",
  "epsilon" => "e",
  "zeta"    => "z",
  "eta"     => "e",
  "theta"   => "th",
  "iota"    => "i",
  "kappa"   => "k",
  "lambda"  => "l",
  "mu"      => "m",
  "nu"      => "n",
  "xi"      => "x",
  "omicron" => "o",
  "pi"      => "p",
  "rho"     => "r",
  "sigma"   => "s",
  "tau"     => "t",
  "upsilon" => "u",
  "phi"     => "ph",
  "chi"     => "ch",
  "psi"     => "ps",
  "omega"   => "o"
}

# Reverse lookup from the Latin rendering to the letter name. Renderings
# shared by two letters keep the one listed last in $greek ("e" => "eta",
# "o" => "omega").
$inverse_greek = $greek.invert

# Words found in the 'wordlists/stopwords' file of Rbbt.datadir. The global
# is left unset when the file is missing. File.exist? is used because
# File.exists? was removed in Ruby 3.2.
stopwords_file = File.join(Rbbt.datadir, 'wordlists/stopwords')
$stopwords = Open.read(stopwords_file).scan(/\w+/) if File.exist?(stopwords_file)
92
-
93
class Array

  # Deals the elements of the array into at most +num+ chunks, round-robin:
  # element i goes to chunk number i % num. Chunk sizes therefore differ by
  # at most one, and fewer than +num+ chunks are returned when the array has
  # fewer than +num+ elements.
  #
  # Note that this definition takes precedence over Enumerable#chunk for
  # arrays.
  def chunk(num)
    each_with_index.each_with_object([]) do |(item, position), buckets|
      slot = position % num
      buckets[slot] ||= []
      buckets[slot] << item
    end
  end
end
@@ -1,235 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/tmpfile'
3
-
4
-
5
# Provides a few helper functions to read and write files, as well as for
# accessing remote files. Downloads of remote files are cached on disk.
#
# Note: cache, remote, gziped and wait used to sit under a bare +private+
# marker. That marker has no effect on methods defined with
# <tt>def self.</tt>, so they have always been callable from outside the
# module; they are kept public here so existing callers keep working.
module Open

  class DirectoryNotFoundError < StandardError; end
  class OpenURLError < StandardError; end

  # Splits +line+ (without its line terminator) into fields using the
  # pattern +sep+. Empty fields are preserved, including trailing ones.
  #
  # The separator is wrapped in a non-capturing group so that patterns with
  # alternatives, such as "\t|\\|", behave as a unit. Without the group, the
  # previous trailing-separator test matched a tab anywhere in the line and
  # appended a spurious empty field.
  def self.fields(line, sep = "\t")
    line.chomp.split(/(?:#{sep})/, -1)
  end

  # Directory holding the cached copies of remote files. It is created when
  # this module is loaded.
  @@remote_cachedir = File.join(Rbbt.cachedir, 'open-remote/')
  FileUtils.mkdir @@remote_cachedir unless File.exist? @@remote_cachedir

  # If no data is specified and the url is found in the cache the saved
  # contents are returned; nil is returned when it is not cached. If +data+
  # is specified then it is saved in the cache under +url+ and nil is
  # returned. To match +url+ in the cache an MD5 digest of it is used as the
  # file name. The location of the cache directory is
  # File.join(Rbbt.cachedir, 'open-remote/').
  def self.cache(url, data = nil)
    require 'digest/md5'
    cached_file = File.join(@@remote_cachedir, Digest::MD5.hexdigest(url))

    if data
      Open.write(cached_file, data)
      return nil
    end

    return nil unless File.exist?(cached_file)
    File.open(cached_file) { |file| file.read }
  end

  # Checks if +url+ is a remote file, that is, whether it starts with an
  # http, https, ssh or ftp scheme. Returns a true value or nil.
  #
  # Objects that cannot be matched against a pattern (Object#=~ was removed
  # in Ruby 3.2) are reported as not remote instead of raising.
  def self.remote(url)
    return nil unless url.respond_to?(:=~)
    url =~ /^(?:http|ssh|https|ftp):\/\//
  end

  # Checks if +url+ is a gzip file. Remote urls may carry a query string
  # after the .gz extension.
  def self.gziped(url)
    if remote(url)
      url =~ /\.gz$/ || url =~ /\.gz\?.*$/
    else
      url =~ /\.gz$/
    end
  end

  # Time of the last call to wait (initially, the time this module loaded).
  @@last_time = Time.now

  # Sleeps for as long as needed so that at least +lag+ seconds have passed
  # since the previous call, then records the current time.
  def self.wait(lag = 0)
    now = Time.now
    earliest = @@last_time + lag

    sleep earliest - now if now < earliest

    @@last_time = Time.now
  end

  # Reads the file specified by url. If the url is local it just reads the
  # file; if it is remote it checks the cache first and otherwise downloads
  # it with wget. Remote gzip files are decompressed automatically. An IO
  # object may be passed instead of a url, in which case it is read.
  #
  # NOTE(review): local .gz files are returned as they are, not decompressed
  # -- confirm whether callers expect that.
  #
  # Options:
  # * :quiet => Do not print the progress of downloads
  # * :nocache => do not use the cache.
  # * :nice => seconds to wait between online queries
  #
  # Raises Open::OpenURLError if the download or the decompression fails.
  def self.read(url, options = {})
    # IO objects are handled first: the remote check below applies a pattern
    # match to its argument.
    return url.read if IO === url
    return File.open(url) { |file| file.read } unless remote(url)

    unless options[:nocache]
      cached = cache(url)
      return cached if cached
    end

    wait(options[:nice]) if options[:nice]
    tmp = TmpFile.tmp_file("open-")

    begin
      # Arguments are passed to wget directly, without going through a
      # shell, so that characters in the url cannot be interpreted as shell
      # syntax.
      command = ['wget', '-O', tmp]
      command << '-q' if options[:quiet]
      command << url
      raise OpenURLError, "Error reading remote url: #{ url }" unless system(*command)

      if gziped(url)
        FileUtils.mv(tmp, "#{tmp}.gz")
        unless system('gunzip', "#{tmp}.gz")
          raise OpenURLError, "Error decompressing remote url: #{ url }"
        end
      end

      data = File.open(tmp) { |file| file.read }
      cache(url, data) unless options[:nocache]
      data
    ensure
      # Clean up whatever is left of the download, also after a failure.
      FileUtils.rm_f(tmp)
      FileUtils.rm_f("#{tmp}.gz")
    end
  end

  # Makes sure that the directory +dir+ exists. A missing directory is
  # created when options[:force] is set; otherwise
  # Open::DirectoryNotFoundError is raised.
  def self.ensure_directory(dir, options)
    return if File.exist?(dir)

    unless options[:force]
      raise Open::DirectoryNotFoundError, "Directory #{dir} was not found"
    end

    FileUtils.makedirs(dir)
  end
  private_class_method :ensure_directory

  # Writes the contents on the path specified by filename, replacing any
  # previous contents. Returns nil.
  #
  # Options:
  # * :force => Create directories if missing.
  #
  # Raises Open::DirectoryNotFoundError if the directory is missing and
  # :force is not set.
  def self.write(filename, content, options = {})
    ensure_directory(File.dirname(filename), options)

    File.open(filename, 'w') { |f| f.write content }

    nil
  end

  # Writes the contents on the path specified by filename. If the file
  # is present it appends the contents. Returns nil.
  #
  # Options:
  # * :force => Create directories if missing.
  #
  # Raises Open::DirectoryNotFoundError if the directory is missing and
  # :force is not set.
  def self.append(filename, content, options = {})
    ensure_directory(File.dirname(filename), options)

    # The block form closes the file even if the write raises.
    File.open(filename, 'a') { |f| f.write content }

    nil
  end

  # Reads a file with rows of elements separated by a given pattern
  # and builds a hash with it. The keys of the hash are the elements in
  # the :native position, by default the first (0). The value for each
  # key is an array with one position for each of the rest of the
  # positions specified in :extra, by default all but the :native. Since
  # the native key may be repeated, each of the positions of the values
  # is in itself an array. Rows with a missing or empty key are skipped.
  # There are a number of options to change this behaviour.
  #
  # Options:
  # * :native => position of the elements that will constitute the keys. By default 0.
  # * :extra => positions of the rest of elements. By default all but :native. It can be an array of positions or a single position.
  # * :sep => pattern to use in splitting the lines into elements, by default "\t"
  # * :flatten => flatten the array of arrays that hold the values for each key into a simple array, dropping empty values.
  # * :single => for each key select only the first of the values, instead of the complete array. Implies :flatten.
  # * :fix => A Proc that is called to pre-process the line
  # * :exclude => A Proc that is called to check if the line must be excluded from the process.
  def self.to_hash(filename, options = {})
    native  = options[:native] || 0
    extra   = options[:extra]
    exclude = options[:exclude]
    fix     = options[:fix]
    sep     = options[:sep] || "\t"
    single  = options[:single] || false
    flatten = options[:flatten] || single

    extra = [extra] if extra && !extra.is_a?(Array)

    data = {}
    Open.read(filename).each_line do |l|
      l = fix.call(l) if fix
      next if exclude && exclude.call(l)

      row_fields = fields(l, sep)
      id = row_fields[native]
      next if id.nil? || id == ""

      data[id] ||= []
      positions = extra || ((0..(row_fields.length - 1)).to_a - [native])
      positions.each_with_index do |pos, i|
        data[id][i] ||= []
        data[id][i] << row_fields[pos]
      end
    end

    if flatten
      data.each do |_key, values|
        next unless values
        values.flatten!
        values.delete_if { |v| v.nil? || v == "" }
      end
    end

    if single
      # Built with a plain loop: splatting the whole data set into Hash[]
      # would pass one argument per key and value.
      firsts = {}
      data.each { |key, values| firsts[key] = values.first }
      data = firsts
    end

    data
  end

end