rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,266 +0,0 @@
1
-
2
# Helpers for "array hashes": hashes whose values are arrays of fields, where
# a field is usually a string of alternatives joined by "|". The class methods
# work on plain hashes; an instance wraps such a hash together with the name
# of its key column (+main+) and the names of its value columns (+fields+).
class ArrayHash

  # Returns a new hash holding the entries of +hash+ under downcased string
  # keys. The returned hash has its #[] redefined to downcase (via to_s) the
  # key it is given; no other accessor is changed.
  def self.make_case_insensitive(hash)
    insensitive = {}
    hash.each { |key, value| insensitive[key.to_s.downcase] = value }

    # Patch only this one hash object, through its singleton class.
    class << insensitive; self; end.instance_eval {
      alias_method :old_get, :[]
      define_method(:[], proc { |key| old_get(key.to_s.downcase) })
    }

    insensitive
  end

  # Take two strings of elements separated by the character sep_char and join
  # them into one, removing empty elements and repetitions. nil arguments are
  # treated as empty strings.
  def self.merge_values_string(list1, list2, sep_char = '|')
    elements = list1.to_s.split(sep_char) + list2.to_s.split(sep_char)
    elements.reject { |e| e.to_s == "" }.uniq.join(sep_char)
  end

  # Merge two lists of elements. If either argument is a String both are
  # merged as sep_char separated strings and a String is returned. Otherwise
  # the arguments are arrays of such strings and are merged position by
  # position; a nil list, or a missing position in the shorter list, counts
  # as empty. Returns an array as long as the longer input.
  def self.merge_values(list1, list2, sep_char = "|")
    if String === list1 || String === list2
      # Forward sep_char so a custom separator is honoured on this path too.
      return merge_values_string(list1, list2, sep_char)
    end

    list1 ||= []
    list2 ||= []

    length = [list1.length, list2.length].max
    (0...length).collect { |i| merge_values_string(list1[i], list2[i], sep_char) }
  end

  # Take an hash of arrays and a position and use the value at that position
  # of the arrays to build a new hash with that value as key, and the original
  # key prepended to the arrays (the pulled out position is removed). Values
  # holding several "|" separated codes produce one entry per code, and
  # entries that share a code are merged with merge_values. Entries whose
  # value at +pos+ is empty are skipped.
  #
  # Options:
  # :case_insensitive:: defaults to true; codes are downcased and the result
  #                     is wrapped with make_case_insensitive.
  # :index::            when true the value of each entry is just the original
  #                     key instead of the complete array of values.
  def self.pullout(hash, pos, options = {})
    index = options[:index]
    case_insensitive = options[:case_insensitive]
    case_insensitive = true if case_insensitive.nil?

    pulled = {}
    hash.each do |key, values|
      code = values[pos].to_s
      next if code == ""

      if index
        list = key
      else
        list = [key] + values
        # +1 because the key now occupies position 0.
        list.delete_at(pos + 1)
      end

      code.split("|").each do |c|
        c = c.downcase if case_insensitive
        pulled[c] = merge_values(pulled[c], list)
      end
    end

    case_insensitive ? make_case_insensitive(pulled) : pulled
  end

  # Merge one hash of arrays into another. Each hash contains a number of
  # fields for each entry. pos1 and pos2 indicate what fields should be used
  # to match entries; each can be an Integer position in the arrays or the
  # symbol :main to refer to the key of the hash. The result maps each key to
  # the fields of hash1 followed by the fields of hash2, padding with empty
  # strings where one side has no entry.
  #
  # The options hash accepts the key :case_insensitive, which defaults to
  # true. Raises a RuntimeError if pos1 or pos2 is neither an Integer nor
  # :main.
  def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})
    case_insensitive = options[:case_insensitive]
    case_insensitive = true if case_insensitive.nil?

    main1 = pos1.to_s.downcase == 'main'
    main2 = pos2.to_s.downcase == 'main'

    # Integer rather than Fixnum: Fixnum no longer exists in current Ruby.
    raise "Key #{ pos1 } should be an Integer or :main" unless Integer === pos1 || main1
    raise "Key #{ pos2 } should be an Integer or :main" unless Integer === pos2 || main2

    # Pullout if pos2 is not :main, honouring the requested case handling.
    hash2 = pullout(hash2, pos2, :case_insensitive => case_insensitive) unless main2

    # Translate the keys of hash2 into keys of hash1 if pos1 is not :main.
    unless main1
      index = pullout(hash1, pos1, options.merge(:index => true))
      translated = {}
      hash2.each do |key, list|
        next unless index[key]
        translated[index[key]] = list
      end
      hash2 = translated
    end

    # Lengths of the arrays on each hash (they should be the same for every
    # entry). An empty hash contributes no fields.
    length1 = (hash1.values.first || []).length
    length2 = (hash2.values.first || []).length

    if case_insensitive
      hash1 = make_case_insensitive hash1
      hash2 = make_case_insensitive hash2
    end

    merged = {}
    (hash1.keys + hash2.keys).uniq.each do |key|
      list1 = hash1[key].nil? ? [''] * length1 : hash1[key]
      list2 = hash2[key].nil? ? [''] * length2 : hash2[key]
      merged[key] = list1 + list2
    end

    merged
  end

  # For a given hash of arrays, filter the position pos of each array with the
  # block of code: the field is split on "|", each element is passed to the
  # block, and the results are joined with "|" again. Returns a new hash; the
  # arrays of the input hash are copied, not modified.
  def self.process(hash, pos, &block)
    processed = {}
    hash.each do |key, values|
      fields = values.dup
      fields[pos] = fields[pos].to_s.split("|").collect { |n| block.call(n) }.join("|")
      processed[key] = fields
    end
    processed
  end

  # Clean structure for repeated values. Each value is kept in exactly one
  # place: the occurrence at the lowest position in the values list wins
  # (columns are assumed to be sorted by importance); among occurrences at the
  # same position the one whose key has the smaller to_i wins, and on a tie
  # the first one encountered is kept. Fields may be "|" separated strings or
  # arrays of values; the result uses the same representation.
  #
  # Options:
  # :case_sensitive:: defaults to true. When false, values that differ only
  #                   in case are treated as repetitions of each other.
  def self.clean(hash, options = {})
    case_sensitive = options[:case_sensitive]
    case_sensitive = true if case_sensitive.nil?

    normalize = lambda { |v| case_sensitive ? v : v.downcase }
    elements  = lambda { |values| String === values ? values.split("|") : values }

    # First pass: record, for every value, the [key, position] that owns it.
    found = {}
    hash.each do |k, list|
      list.each_with_index do |values, i|
        elements.call(values).each do |v|
          v = normalize.call(v)
          owner = found[v]
          if owner.nil? || owner[1] > i || (owner[1] == i && owner[0].to_i > k.to_i)
            found[v] = [k, i]
          end
        end
      end
    end

    # Second pass: keep each value only at the place that owns it.
    cleaned = {}
    hash.each do |k, list|
      new_list = []
      list.each_with_index do |values, i|
        kept = elements.call(values).select { |v| found[normalize.call(v)] == [k, i] }
        new_list << (String === values ? kept.join("|") : kept)
      end
      cleaned[k] = new_list
    end
    cleaned
  end

  attr_reader :main, :fields, :data

  # Wraps +hash+ (key => array of fields). +main+ names the key column and
  # +fields+ names the value columns; when +fields+ is nil or empty the names
  # F0, F1, ... are generated from the length of the first entry (none for an
  # empty hash). All names are stored as strings.
  def initialize(hash, main, fields = nil)
    @data = hash
    @main = main.to_s

    if fields.nil? || fields.empty?
      width = (hash.values.first || []).length
      fields = (0...width).collect { |i| "F#{i}" }
    end

    @fields = fields.collect { |f| f.to_s }
  end

  # Wrapper for ArrayHash.process on the named field. Replaces the wrapped
  # data with the processed copy and returns self.
  def process(field, &block)
    pos = self.field_pos(field)
    @data = ArrayHash.process(self.data, pos, &block)
    self
  end

  # Returns the position of a given field in the value arrays, :main if the
  # field is the key column, or nil if there is no such field. Names are
  # compared case insensitively.
  def field_pos(field)
    return :main if field == :main

    wanted = field.to_s.downcase
    return :main if wanted == self.main.to_s.downcase

    @fields.collect { |f| f.downcase }.index(wanted)
  end

  # Merge another ArrayHash into this one using the specified field (by
  # default this object's main field) to match entries. The fields of +other+
  # are appended to this object's fields; +other+ itself is left unchanged.
  # Raises a RuntimeError if the field is missing on either side. Returns
  # self.
  def merge(other, field = :main, options = {})
    field = self.main if field == :main

    pos1 = self.field_pos(field)
    pos2 = other.field_pos(field)

    raise "Field #{ field } not found in target hash" if pos1.nil?
    raise "Field #{ field } not found in added hash" if pos2.nil?

    @data = ArrayHash.merge(self.data, other.data, pos1, pos2, options)

    # Work on a copy so that other's own field list is not altered.
    new_fields = other.fields.dup
    unless pos2 == :main
      # The matching column is replaced by other's key column, up front,
      # mirroring what ArrayHash.pullout does to the data.
      new_fields.delete_at(pos2)
      new_fields.unshift(other.main)
    end

    @fields += new_fields
    self
  end

  # Remove a field from the ArrayHash, deleting it in place from every entry.
  # Returns nil if the field is unknown, self otherwise.
  # NOTE: the main field resolves to the position :main, which delete_at does
  # not accept, so it cannot be removed through this method.
  def remove(field)
    pos = self.field_pos(field)
    return if pos.nil?

    @data.each { |key, values| values.delete_at(pos) }
    @fields.delete_at(pos)
    self
  end

  # Wrapper for ArrayHash.clean with default options. Returns self.
  def clean
    @data = ArrayHash.clean(@data)
    self
  end
end
263
-
264
-
265
-
266
-
@@ -1,72 +0,0 @@
1
- require 'fileutils'
2
- require 'rbbt'
3
-
4
# Provides caching functionality for files downloaded from the internet.
# Files are stored under Rbbt.cachedir, spread over nested one-character
# directories derived from the file name (see FileCache.path).
module FileCache

  class BadPathError < StandardError; end
  class FileExistsError < StandardError; end

  # NOTE: every method here is defined on the module itself (def self.), so a
  # bare `private` section marker would not hide any of them. The helpers
  # below are therefore part of the callable interface, as they always were.

  # Remove slash characters from filename, replacing each with _SLASH_.
  def self.clean_path(filename)
    filename.gsub(/\//, '_SLASH_')
  end

  # Check that the file name is safe and is in the correct format: it must
  # not contain "/" and must look like name.ext. Raises BadPathError
  # otherwise.
  def self.sanity_check(filename)
    if filename =~ /\//
      raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
    end
    if filename !~ /.+\..+/
      raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
    end
  end

  # Find the path that a particular file would have in the cache. The last
  # (up to) five characters of the name, without the extension, become one
  # nested directory each: "abcdefg.txt" lives in c/d/e/f/g/ under the cache
  # directory. Raises BadPathError for invalid names.
  def self.path(filename)
    sanity_check(filename)

    name = filename.match(/(.+)\.(.+)/)[1]
    dirs = name.scan(/./).last(5).join('/')

    File.join(File.join(Rbbt.cachedir, dirs), filename)
  end

  # Add a file in the cache, writing +content+ to it. Raises FileExistsError
  # if it exists, unless the :force (or 'force') option is set. The
  # directories are created with mode 0777 and the file is set to mode 0666.
  # Returns nil.
  def self.add_file(filename, content, options = {})
    sanity_check(filename)

    target = path(filename)
    FileUtils.makedirs(File.dirname(target), :mode => 0777)

    forced = options[:force] || options['force']
    if File.exist?(target) && !forced
      raise FileCache::FileExistsError, "File #{filename} already in cache"
    end

    File.open(target, 'w') { |f| f.write(content) }
    FileUtils.chmod 0666, target

    nil
  end

  # Removes the file from cache; does nothing if it is not there. Returns
  # nil.
  def self.del_file(filename)
    sanity_check(filename)

    target = path(filename)
    FileUtils.rm target if File.exist? target

    nil
  end

end
@@ -1,47 +0,0 @@
1
- require 'rbbt/util/open'
2
- require 'rbbt/util/arrayHash'
3
-
4
module Index

  # Builds an inverse index from +lexicon+, which is read with Open.to_hash
  # into a hash of code => lists of identifiers. The result maps every
  # identifier, and every code itself, to its code. Entries with a nil or
  # empty code are ignored.
  #
  # Options (all of them are also forwarded to Open.to_hash):
  # :sep::            field separator, "\t" by default.
  # :sep2::           separator inside a field, '\|' by default.
  # :case_sensitive:: true by default. When false, keys are stored downcased
  #                   and the #[] and #values_at methods of the returned hash
  #                   downcase the keys they are given.
  # :clean::          when set, the data goes through ArrayHash.clean first.
  def self.index(lexicon, options = {})
    options = {:sep => "\t", :sep2 => '\|', :case_sensitive => true}.merge(options)
    fold_case = !options[:case_sensitive]
    as_key = lambda { |id| fold_case ? id.downcase : id }

    data = Open.to_hash(lexicon, options)
    data = ArrayHash.clean(data) if options[:clean]

    index = {}

    # Identifiers first ...
    data.each do |code, id_lists|
      next if code.nil? || code == ""
      id_lists.flatten.compact.uniq.each do |id|
        index[as_key.call(id)] = code
      end
    end

    # ... then the codes, so that a code always resolves to itself even when
    # it also shows up as an identifier of another entry.
    data.each do |code, _id_lists|
      next if code.nil? || code == ""
      index[as_key.call(code)] = code
    end

    return index unless fold_case

    # Make lookups on this particular hash case insensitive.
    singleton = class << index; self; end
    singleton.instance_eval do
      alias_method :old_get, :[]
      define_method(:[], proc { |key| old_get(key.to_s.downcase) })

      alias_method :old_values_at, :values_at
      define_method(:values_at, proc { |*keys| old_values_at(*keys.collect { |key| key.to_s.downcase }) })
    end

    index
  end
end
@@ -1,106 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
-
4
# Extensions to String used for text mining heuristics.
class String
  # Consonant clusters that are acceptable in ordinary words, one per line of
  # the wordlists/consonants data file. Left empty if that file is not
  # installed.
  CONSONANTS = []
  consonants_file = File.join(Rbbt.datadir, 'wordlists/consonants')
  if File.exist? consonants_file
    Object::Open.read(consonants_file).each_line { |l| CONSONANTS << l.chomp }
  end

  # Uses heuristics to check if a string seems like a special word, like a
  # gene name. The checks run in order and the first one that applies
  # decides; returns true or false.
  def is_special?
    # Only consonants
    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

    # Not a word
    return false if self =~ /[^\s]\s[^\s]/
    return false if self.length < 3
    # Alphanumeric
    return true if self =~ /[0-9]/ && self =~ /[a-z]/i
    # All Caps
    return true if self =~ /[A-Z]{2,}/
    # Caps Mix
    return true if self =~ /[a-z][A-Z]/
    # All consonants. As written this pattern only matches a line made of a
    # single letter.
    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
    # Dashed word
    return true if self =~ /(^\w-|-\w$)/
    # Too many consonants (very heuristic): the first run of three or more
    # characters that are not vowels (nor y) is not a known cluster.
    if self =~ /([^aeiouy]{3,})/i && !CONSONANTS.include?($1.downcase)
      return true
    end

    return false
  end

  # Returns a copy of the string with its first character turned to
  # lowercase. The rest of the string, including any newlines, is kept as is.
  def downcase_first
    return "" if self == ""
    self[0, 1].downcase + self[1..-1]
  end

  # Turns a roman number into arabic form if possible. Just simple romans
  # only: I, II, III, IV, V and X. Returns nil for anything else.
  def arabic
    case self
    when /^I$/   then 1
    when /^II$/  then 2
    when /^III$/ then 3
    when /^IV$/  then 4
    when /^V$/   then 5
    when /^X$/   then 10
    end
  end
end
57
-
58
-
59
-
60
-
61
# Latin transliteration for the name of each greek letter.
$greek = {
  "alpha"   => "a",  "beta"    => "b",  "gamma"   => "g",  "delta"   => "d",
  "epsilon" => "e",  "zeta"    => "z",  "eta"     => "e",  "theta"   => "th",
  "iota"    => "i",  "kappa"   => "k",  "lambda"  => "l",  "mu"      => "m",
  "nu"      => "n",  "xi"      => "x",  "omicron" => "o",  "pi"      => "p",
  "rho"     => "r",  "sigma"   => "s",  "tau"     => "t",  "upsilon" => "u",
  "phi"     => "ph", "chi"     => "ch", "psi"     => "ps", "omega"   => "o"
}

# Reverse table, from transliteration to letter name. Where two letters share
# a transliteration the one listed later wins ("e" => "eta", "o" => "omega").
$inverse_greek = $greek.invert
90
-
91
# Load the stopword list, one entry per run of word characters, when the
# wordlists/stopwords data file is installed; otherwise $stopwords is left
# untouched.
stopwords_file = File.join(Rbbt.datadir, 'wordlists/stopwords')
$stopwords = Open.read(stopwords_file).scan(/\w+/) if File.exist? stopwords_file
92
-
93
class Array

  # Divides the array into +num+ chunks of (nearly) the same size by placing
  # one element in each chunk iteratively: element i goes to chunk i % num.
  # Fewer than +num+ chunks are returned when the array has fewer than +num+
  # elements.
  #
  # This method shares its name with Enumerable#chunk. When called without
  # +num+ the call is handed over to that implementation, so code that relies
  # on the block form keeps working on arrays.
  def chunk(num = nil, &block)
    return super(&block) if num.nil?

    chunks = []
    each_with_index do |e, i|
      (chunks[i % num] ||= []) << e
    end
    chunks
  end
end