rbbt 1.1.7 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +72 -136
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -246
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -145
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -79
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Rakefile +0 -43
  22. data/install_scripts/organisms/cgd.Rakefile +0 -84
  23. data/install_scripts/organisms/human.Rakefile +0 -145
  24. data/install_scripts/organisms/mgi.Rakefile +0 -77
  25. data/install_scripts/organisms/pombe.Rakefile +0 -40
  26. data/install_scripts/organisms/rake-include.rb +0 -258
  27. data/install_scripts/organisms/rgd.Rakefile +0 -88
  28. data/install_scripts/organisms/sgd.Rakefile +0 -66
  29. data/install_scripts/organisms/tair.Rakefile +0 -54
  30. data/install_scripts/organisms/worm.Rakefile +0 -109
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -86
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -213
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -40
  49. data/lib/rbbt/sources/organism.rb +0 -245
  50. data/lib/rbbt/sources/polysearch.rb +0 -117
  51. data/lib/rbbt/sources/pubmed.rb +0 -111
  52. data/lib/rbbt/util/arrayHash.rb +0 -255
  53. data/lib/rbbt/util/filecache.rb +0 -72
  54. data/lib/rbbt/util/index.rb +0 -47
  55. data/lib/rbbt/util/misc.rb +0 -106
  56. data/lib/rbbt/util/open.rb +0 -235
  57. data/lib/rbbt/util/rake.rb +0 -183
  58. data/lib/rbbt/util/simpleDSL.rb +0 -87
  59. data/lib/rbbt/util/tmpfile.rb +0 -19
  60. data/tasks/install.rake +0 -124
@@ -1,72 +0,0 @@
1
- require 'fileutils'
2
- require 'rbbt'
3
-
4
- # Provides caching functionality for files downloaded from the internet
5
- module FileCache
6
-
7
- class BadPathError < StandardError; end
8
- class FileExistsError < StandardError; end
9
-
10
- private
11
-
12
- # Remove slash characters from filename.
13
- def self.clean_path(filename)
14
- filename.gsub(/\//,'_SLASH_')
15
- end
16
-
17
- # Check that the file name is safe and is in the correct format
18
- def self.sanity_check(filename)
19
- if filename =~ /\//
20
- raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
21
- end
22
- if filename !~ /.+\..+/
23
- raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
24
- end
25
- end
26
-
27
- public
28
-
29
- # Find the path that a particular file would have in the cache
30
- def self.path(filename)
31
- sanity_check(filename)
32
-
33
- name, extension = filename.match(/(.+)\.(.+)/).values_at(1,2)
34
- dirs = name.scan(/./).reverse.values_at(0,1,2,3,4).reverse.compact.join('/')
35
-
36
- return File.join(File.join(Rbbt.cachedir,dirs),filename)
37
- end
38
-
39
- # Add a file in the cache. Raise exception if exists, unless force is
40
- # used.
41
- def self.add_file(filename, content, options = {})
42
- sanity_check(filename)
43
-
44
- path = path(filename)
45
- FileUtils.makedirs(File.dirname(path), :mode => 0777)
46
-
47
- if File.exist?(path) and ! (options[:force] || options['force'])
48
- raise FileCache::FileExistsError, "File #{filename} already in cache"
49
- end
50
-
51
- File.open(path,'w'){|f|
52
- f.write(content)
53
- }
54
- FileUtils.chmod 0666, path
55
-
56
- nil
57
- end
58
-
59
- # Removes the file from cache
60
- def self.del_file(filename)
61
- sanity_check(filename)
62
-
63
- path = path(filename)
64
-
65
- if File.exist? path
66
- FileUtils.rm path
67
- end
68
-
69
- nil
70
- end
71
-
72
- end
@@ -1,47 +0,0 @@
1
- require 'rbbt/util/open'
2
- require 'rbbt/util/arrayHash'
3
-
4
- module Index
5
-
6
- # Creates an inverse index. Takes a file with rows of elements
7
- # separated by a given pattern (specified by +sep+) and returns a hash
8
- # where each element points to the first element in the row. +lexicon+
9
- # is the file containing the data.
10
- def self.index(lexicon, options = {})
11
- options = {:sep => "\t|\\|", :case_sensitive => true}.merge(options)
12
-
13
-
14
- data = Open.to_hash(lexicon, options)
15
- if options[:clean]
16
- data = ArrayHash.clean(data)
17
- end
18
-
19
- index = {}
20
-
21
- data.each{|code, id_lists|
22
- next if code.nil? || code == ""
23
- id_lists.flatten.compact.uniq.each{|id|
24
- id = id.downcase unless options[:case_sensitive]
25
- index[id] = code
26
- }
27
- }
28
- data.each{|code, id_lists|
29
- next if code.nil? || code == ""
30
- id = code
31
- id = id.downcase unless options[:case_sensitive]
32
- index[id] = code
33
- }
34
-
35
- if !options[:case_sensitive]
36
- class << index; self; end.instance_eval{
37
- alias_method :old_get, :[]
38
- define_method(:[], proc{|key| old_get(key.to_s.downcase)})
39
-
40
- alias_method :old_values_at, :values_at
41
- define_method(:values_at, proc{|*keys| old_values_at(*keys.collect{|key| key.to_s.downcase }) })
42
- }
43
- end
44
-
45
- index
46
- end
47
- end
@@ -1,106 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
-
4
- class String
5
- CONSONANTS = []
6
- if File.exists? File.join(Rbbt.datadir, 'wordlists/consonants')
7
- Object::Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line{|l| CONSONANTS << l.chomp}
8
- end
9
-
10
- # Uses heuristics to checks if a string seems like a special word, like a gene name.
11
- def is_special?
12
- # Only consonants
13
- return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i
14
-
15
- # Not a word
16
- return false if self =~ /[^\s]\s[^\s]/;
17
- return false if self.length < 3;
18
- # Alphanumeric
19
- return true if self =~ /[0-9]/ && self =~ /[a-z]/i
20
- # All Caps
21
- return true if self =~ /[A-Z]{2,}/;
22
- # Caps Mix
23
- return true if self =~ /[a-z][A-Z]/;
24
- # All consonants
25
- return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
26
- # Dashed word
27
- return true if self =~ /(^\w-|-\w$)/
28
- # To many consonants (very heuristic)
29
- if self =~ /([^aeiouy]{3,})/i && !CONSONANTS.include?($1.downcase)
30
- return true
31
- end
32
-
33
- return false
34
- end
35
-
36
- # Turns the first letter to lowercase
37
- def downcase_first
38
- return "" if self == ""
39
- letters = self.scan(/./)
40
- letters[0].downcase!
41
- letters.join("")
42
- end
43
-
44
- # Turns a roman number into arabic form is possible. Just simple
45
- # romans only...
46
- def arabic
47
- return 1 if self =~ /^I$/;
48
- return 2 if self =~ /^II$/;
49
- return 3 if self =~ /^III$/;
50
- return 4 if self =~ /^IV$/;
51
- return 5 if self =~ /^V$/;
52
- return 10 if self =~ /^X$/;
53
-
54
- return nil
55
- end
56
- end
57
-
58
-
59
-
60
-
61
- $greek = {
62
- "alpha" => "a",
63
- "beta" => "b",
64
- "gamma" => "g",
65
- "delta" => "d",
66
- "epsilon" => "e",
67
- "zeta" => "z",
68
- "eta" => "e",
69
- "theta" => "th",
70
- "iota" => "i",
71
- "kappa" => "k",
72
- "lambda" => "l",
73
- "mu" => "m",
74
- "nu" => "n",
75
- "xi" => "x",
76
- "omicron" => "o",
77
- "pi" => "p",
78
- "rho" => "r",
79
- "sigma" => "s",
80
- "tau" => "t",
81
- "upsilon" => "u",
82
- "phi" => "ph",
83
- "chi" => "ch",
84
- "psi" => "ps",
85
- "omega" => "o"
86
- }
87
-
88
- $inverse_greek = Hash.new
89
- $greek.each{|l,s| $inverse_greek[s] = l }
90
-
91
- $stopwords = Open.read(File.join(Rbbt.datadir, 'wordlists/stopwords')).scan(/\w+/) if File.exists? File.join(Rbbt.datadir, 'wordlists/stopwords')
92
-
93
- class Array
94
-
95
- # Divides the array into +num+ chunks of the same size by placing one
96
- # element in each chunk iteratively.
97
- def chunk(num)
98
- chunks = []
99
- each_with_index{|e, i|
100
- c = i % num
101
- chunks[c] ||=[]
102
- chunks[c] << e
103
- }
104
- chunks
105
- end
106
- end
@@ -1,235 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/tmpfile'
3
-
4
-
5
- # Provides with a few helper functions to read and write files, as well # as
6
- # for accessing remote files. It supports caching the files.
7
- module Open
8
-
9
- def self.fields(line, sep = "\t")
10
- chunks = line.chomp.split(/(#{sep})/).select{|c| c !~ /^#{sep}$/ }
11
- if line =~ /#{sep}$/
12
- chunks << ""
13
- end
14
- chunks
15
- end
16
-
17
- class DirectoryNotFoundError < StandardError; end
18
- class OpenURLError < StandardError; end
19
-
20
- private
21
-
22
- @@remote_cachedir = File.join(Rbbt.cachedir, 'open-remote/')
23
- FileUtils.mkdir @@remote_cachedir unless File.exist? @@remote_cachedir
24
-
25
- # If no data is specified and the url is found in the cache the saved
26
- # contents are returned, if not found, the url is opened and the contents of
27
- # that are returned. If +data+ is specified then it is saved in the
28
- # cache under +url+. To match +url+ in the cache a MD5 digest is used.
29
- # The location of the cache directory is bu default
30
- # File.join(Rbbt.cachedir, 'open-remote/').
31
- def self.cache(url, data = nil)
32
- require 'digest/md5'
33
- digest = Digest::MD5.hexdigest(url)
34
-
35
- if data
36
- Open.write(File.join(@@remote_cachedir, digest), data)
37
- return nil
38
- else
39
- if File.exist? File.join(@@remote_cachedir, digest)
40
- return File.open(File.join(@@remote_cachedir, digest)){|file| file.read }
41
- else
42
- return nil
43
- end
44
- end
45
- end
46
-
47
- # Checks if +url+ is a remote file.
48
- def self.remote(url)
49
- url =~ /^(?:http|ssh|https|ftp):\/\//
50
- end
51
-
52
-
53
- # Checks if +url+ is a gzip file.
54
- def self.gziped(url)
55
- if remote(url)
56
- return url =~ /\.gz$/ || url =~ /\.gz\?.*$/
57
- else
58
- return url =~ /\.gz$/
59
- end
60
- end
61
-
62
-
63
- @@last_time = Time.now
64
- def self.wait(lag = 0)
65
- time = Time.now
66
-
67
- if time < @@last_time + lag
68
- sleep @@last_time + lag - time
69
- end
70
-
71
- @@last_time = Time.now
72
- end
73
-
74
- public
75
- # Reads the file specified by url. If the url es local it just opens
76
- # the file, if it is remote if checks the cache first. In any case, it
77
- # unzips gzip files automatically.
78
- #
79
- # Options:
80
- # * :quiet => Do not print the progress of downloads
81
- # * :nocache => do not use the cache.
82
- # * :nice => secconds to wait between online queries
83
- #
84
- def self.read(url, options = {})
85
-
86
- case
87
- when remote(url)
88
- if !options[:nocache] && data = cache(url)
89
- return data
90
- end
91
-
92
- wait(options[:nice]) if options[:nice]
93
- tmp = TmpFile.tmp_file("open-")
94
- `wget -O #{tmp} '#{url}' #{options[:quiet] ? '-q' : '' }`
95
-
96
- if $?.success?
97
- if gziped(url)
98
- `mv #{tmp} #{tmp}.gz; gunzip #{tmp}`
99
- end
100
-
101
- cache(url, File.open(tmp){|file| file.read}) unless options[:nocache]
102
-
103
- data = File.open(tmp){|file| file.read}
104
- FileUtils.rm tmp
105
- return data
106
- else
107
- raise OpenURLError, "Error reading remote url: #{ url }"
108
- end
109
-
110
- when IO === url
111
- url.read
112
- else
113
- return File.open(url){|file| file.read}
114
- end
115
-
116
- end
117
-
118
- # Writes the contents on the path specified by filename
119
- #
120
- # Options:
121
- # * :force => Create directories if missing.
122
- def self.write(filename, content, options = {})
123
- if !File.exist? File.dirname(filename)
124
- if options[:force]
125
- FileUtils.makedirs(File.dirname(filename))
126
- else
127
- raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
128
- end
129
- end
130
-
131
- File.open(filename,'w'){|f|
132
- f.write content
133
- }
134
-
135
- nil
136
- end
137
-
138
- # Writes the contents on the path specified by filename. If the file
139
- # is present it appends the contents.
140
- #
141
- # Options:
142
- # * :force => Create directories if missing.
143
- def self.append(filename, content, options ={})
144
- if !File.exist? File.dirname(filename)
145
- if options[:force]
146
- FileUtils.makedirs(File.dirname(filename))
147
- else
148
- raise Open::DirectoryNotFoundError, "Directory #{File.dirname(filename)} was not found"
149
- end
150
- end
151
-
152
- f = File.open(filename,'a')
153
- f.write content
154
- f.close
155
-
156
- nil
157
- end
158
-
159
-
160
-
161
- # Reads a file with rows with elementes separated by a given pattern
162
- # and builds a hash with it. The keys of the hash are the elements in
163
- # the :native positions, by default the first (0). The value for each
164
- # key is an array with one position for each of the rest possible
165
- # positions specified in :extra, by default all but the :native. Since
166
- # the native key may be repeated, each of the positions of the values
167
- # is in itself an array. There are a number of options to change this
168
- # behaviour.
169
- #
170
- # Options:
171
- # * :native => position of the elements that will constitute the keys. By default 0.
172
- # * :extra => positions of the rest of elements. By default all but :native. It can be an array of positions or a single position.
173
- # * :sep => pattern to use in splitting the lines into elements, by default "\t"
174
- # * :flatten => flatten the array of arrays that hold the values for each key into a simple array.
175
- # * :single => for each key select only the first of the values, instead of the complete array.
176
- # * :fix => A Proc that is called to pre-process the line
177
- # * :exclude => A Proc that is called to check if the line must be excluded from the process.
178
- def self.to_hash(filename, options = {})
179
- native = options[:native] || 0
180
- extra = options[:extra]
181
- exclude = options[:exclude]
182
- fix = options[:fix]
183
- sep = options[:sep] || "\t"
184
- single = options[:single]
185
- single = false if single.nil?
186
- flatten = options[:flatten] || single
187
- flatten = single if flatten.nil?
188
-
189
- extra = [extra] if extra && ! extra.is_a?( Array)
190
-
191
- data = {}
192
- Open.read(filename).each_line{|l|
193
- l = fix.call(l) if fix
194
- next if exclude and exclude.call(l)
195
-
196
- row_fields = self.fields(l, sep)
197
- id = row_fields[native]
198
- next if id.nil? || id == ""
199
-
200
- data[id] ||= []
201
- if extra
202
- fields = extra
203
- else
204
- fields = (0..(row_fields.length - 1)).to_a - [native]
205
- end
206
- fields.each_with_index{|pos,i|
207
- data[id][i] ||= []
208
- data[id][i] << row_fields[pos]
209
- }
210
- }
211
-
212
- if flatten
213
- data.each{|key, values|
214
- if values
215
- values.flatten!
216
- values.collect!{|v|
217
- if v != ""
218
- v
219
- else
220
- nil
221
- end
222
- }
223
- values.compact!
224
- else
225
- nil
226
- end
227
- }
228
- end
229
-
230
- data = Hash[*(data.collect{|key,values| [key, values.first]}).flatten] if single
231
-
232
- data
233
- end
234
-
235
- end