rbbt 1.2.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,266 +0,0 @@
1
-
2
# Helpers for tables stored as a Hash of Arrays (+{ key => [field, field, ...] }+).
# Each field is normally a String that may hold several values separated by "|".
#
# The class methods work on plain hashes. Instances wrap one such hash
# (+data+) together with the name of the key column (+main+) and the names
# of the value columns (+fields+).
class ArrayHash

  # Builds a new Hash from +hash+ with every key converted to a downcased
  # String, and overrides +[]+ on that one Hash so lookups downcase the
  # requested key as well. Only +[]+ is overridden; other accessors such as
  # +fetch+ or +key?+ remain case sensitive.
  def self.make_case_insensitive(hash)
    insensitive = {}
    hash.each { |key, value| insensitive[key.to_s.downcase] = value }

    # Patch the singleton class so that only this Hash instance is affected.
    class << insensitive; self; end.instance_eval {
      alias_method :old_get, :[]
      define_method(:[], proc { |key| old_get(key.to_s.downcase) })
    }

    insensitive
  end

  # Takes two strings of elements separated by +sep_char+ and joins them
  # into one, dropping empty elements and repetitions. +nil+ arguments are
  # treated as empty strings.
  def self.merge_values_string(list1, list2, sep_char = '|')
    elements = list1.to_s.split(sep_char) + list2.to_s.split(sep_char)
    elements.reject { |e| e.to_s == "" }.uniq.join(sep_char)
  end

  # Merges two lists of elements. If either argument is a String both are
  # merged as +sep_char+ separated strings; otherwise the arguments are
  # treated as arrays of such strings and merged position by position.
  # A +nil+ list is replaced by empty strings matching the other list.
  # The result has as many entries as +list1+.
  def self.merge_values(list1, list2, sep_char = "|")
    if String === list1 || String === list2
      # sep_char must be forwarded here, otherwise a custom separator is
      # silently replaced by the default "|".
      return merge_values_string(list1, list2, sep_char)
    end

    list1 = [''] * list2.length if list1.nil?
    list2 = [''] * list1.length if list2.nil?

    merged = []
    list1.each_with_index { |elem, i|
      merged << merge_values_string(elem, list2[i], sep_char)
    }
    merged
  end

  # Takes a hash of arrays and a position and uses the value at that position
  # of each array to build a new hash keyed by that value, with the original
  # key prepended to the remaining values. Values holding several "|"
  # separated codes produce one entry per code; entries with an empty code
  # are skipped. Options:
  #
  # :case_insensitive:: defaults to true; codes are downcased and the result
  #                     answers +[]+ case insensitively.
  # :index::            when true the entry value is the original key instead
  #                     of the complete array of values.
  def self.pullout(hash, pos, options = {})
    index = options[:index]
    index = false if index.nil?
    case_insensitive = options[:case_insensitive]
    case_insensitive = true if case_insensitive.nil?

    pulled = {}
    hash.each { |key, values|
      code = values[pos].to_s
      next if code == ""

      if index
        list = key
      else
        list = [key] + values
        list.delete_at(pos + 1) # drop the field that becomes the new key
      end

      code.split("|").each { |c|
        c = c.downcase if case_insensitive
        pulled[c] = merge_values(pulled[c], list)
      }
    }

    pulled = make_case_insensitive(pulled) if case_insensitive
    pulled
  end

  # Merges one hash of arrays into another. +pos1+ and +pos2+ select the
  # field used to match entries in +hash1+ and +hash2+: an Integer position
  # in the arrays, or :main for the hash key. Entries missing on one side
  # are padded with empty strings. The option :case_insensitive defaults to
  # true, in which case the keys of the result are downcased.
  #
  # Raises RuntimeError when a position is neither an Integer nor :main.
  def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})
    case_insensitive = options[:case_insensitive]
    case_insensitive = true if case_insensitive.nil?

    # Integer rather than Fixnum: Fixnum was removed from Ruby (3.2).
    raise "Key #{ pos1 } should be an Integer or :main" unless Integer === pos1 || pos1.to_s.downcase == 'main'
    raise "Key #{ pos2 } should be an Integer or :main" unless Integer === pos2 || pos2.to_s.downcase == 'main'

    # Re-key hash2 by the matching field if pos2 is not :main. The case
    # setting is forwarded so both sides are keyed consistently.
    unless pos2.to_s.downcase == 'main'
      hash2 = pullout(hash2, pos2, :case_insensitive => case_insensitive)
    end

    # Translate hash2 keys into hash1 keys if pos1 is not :main.
    if pos1.to_s.downcase != 'main'
      index = pullout(hash1, pos1, options.merge(:index => true))
      translated = {}
      hash2.each do |key, list|
        next unless index[key]
        translated[index[key]] = list
      end
      hash2 = translated
    end

    # Lengths of the arrays on each hash (assumed equal for every entry).
    # An empty hash contributes zero columns instead of failing on nil.
    length1 = (hash1.values.first || []).length
    length2 = (hash2.values.first || []).length

    if case_insensitive
      hash1 = make_case_insensitive hash1
      hash2 = make_case_insensitive hash2
    end

    merged = {}
    (hash1.keys + hash2.keys).uniq.each do |key|
      list1 = hash1[key].nil? ? [''] * length1 : hash1[key]
      list2 = hash2[key].nil? ? [''] * length2 : hash2[key]
      merged[key] = list1 + list2
    end

    merged
  end

  # For a given hash of arrays, passes each "|" separated value found at
  # position +pos+ through the block and stores the results, joined by "|",
  # at the same position. Returns a new hash; the arrays of the input hash
  # are copied, not modified.
  def self.process(hash, pos, &block)
    processed = {}
    hash.each { |key, values|
      row = values.dup # do not alter the caller's arrays
      row[pos] = row[pos].to_s.split("|").collect { |n| block.call(n) }.join("|")
      processed[key] = row
    }
    processed
  end

  # Cleans the structure of repeated values. When the same value appears
  # more than once only one occurrence is kept: the one at the smallest
  # position in the values list (columns are assumed to be sorted by
  # importance) and, for occurrences at the same position, the one whose
  # key has the smallest value after +to_i+. Fields that are not Strings are
  # copied unchanged.
  #
  # The option :case_sensitive defaults to true; pass false to compare
  # values ignoring case.
  def self.clean(hash, options = {})
    case_sensitive = options[:case_sensitive]
    case_sensitive = true if case_sensitive.nil?
    normalize = proc { |v| case_sensitive ? v : v.downcase }

    # First pass: record which [key, position] owns each value.
    found = {}
    hash.each { |k, list|
      list.each_with_index { |values, i|
        (String === values ? values.split("|") : values).each { |v|
          v = normalize.call(v)
          if found[v].nil?
            found[v] = [k, i]
          else
            last_k, last_i = found[v]
            if last_i > i || (last_i == i && last_k.to_i > k.to_i)
              found[v] = [k, i]
            end
          end
        }
      }
    }

    # Second pass: keep each value only where it is owned.
    new_hash = {}
    hash.each { |k, list|
      new_list = []
      list.each_with_index { |values, i|
        if String === values
          kept = values.split("|").select { |v| found[normalize.call(v)] == [k, i] }
          new_list << kept.join("|")
        else
          new_list << values
        end
      }
      new_hash[k] = new_list
    }
    new_hash
  end

  attr_reader :main, :fields, :data

  # +hash+ is the hash of arrays, +main+ the name of the key column and
  # +fields+ the names of the value columns. When +fields+ is nil or empty
  # the names F0, F1, ... are generated from the length of the first entry
  # (none for an empty hash).
  def initialize(hash, main, fields = nil)
    @data = hash
    @main = main.to_s

    if fields.nil? || fields.empty?
      width = (hash.values.first || []).length
      fields = (0...width).collect { |i| "F#{i}" }
    end

    @fields = fields.collect { |f| f.to_s }
  end

  # Instance wrapper for ArrayHash.process: filters the named field through
  # the block, replaces +data+ with the result and returns self.
  def process(field, &block)
    pos = self.field_pos(field)
    @data = ArrayHash.process(self.data, pos, &block)
    self
  end

  # Returns the position of a given field in the value arrays, :main when
  # the field is the key column, or nil when it is unknown. Names are
  # compared ignoring case.
  def field_pos(field)
    return :main if field == :main
    return :main if field.to_s.downcase == self.main.to_s.downcase

    @fields.collect { |f| f.downcase }.index(field.to_s.downcase)
  end

  # Merges another ArrayHash into this one using the specified field to
  # match entries, appends the other's field names and returns self.
  # Raises RuntimeError if the field is missing on either side.
  def merge(other, field = :main, options = {})
    field = self.main if field == :main

    pos1 = self.field_pos(field)
    pos2 = other.field_pos(field)

    raise "Field #{ field } not found in target hash" if pos1.nil?
    raise "Field #{ field } not found in added hash" if pos2.nil?

    @data = ArrayHash.merge(self.data, other.data, pos1, pos2, options)

    # Work on a copy so the other ArrayHash keeps its own field list.
    new_fields = other.fields.dup
    unless pos2 == :main
      new_fields.delete_at(pos2)
      new_fields.unshift(other.main)
    end
    @fields += new_fields
    self
  end

  # Removes a field from the ArrayHash, modifying the stored arrays in
  # place. Returns self, or nil when the field is unknown.
  def remove(field)
    pos = self.field_pos(field)
    return if pos.nil?
    @data = self.data.each { |key, values| values.delete_at(pos) }
    @fields.delete_at(pos)
    self
  end

  # Instance wrapper for ArrayHash.clean with default options; returns self.
  def clean
    @data = ArrayHash.clean(@data)
    self
  end
end
263
-
264
-
265
-
266
-
@@ -1,72 +0,0 @@
1
- require 'fileutils'
2
- require 'rbbt'
3
-
4
- # Provides caching functionality for files downloaded from the internet
5
# Provides caching functionality for files downloaded from the internet.
# Cached files live under +Rbbt.cachedir+, in nested single-character
# directories derived from the last characters of the file's base name.
module FileCache

  class BadPathError < StandardError; end
  class FileExistsError < StandardError; end

  # NOTE: this module used to place `private` / `public` markers around the
  # two helpers below. Those markers have no effect on methods defined with
  # `def self.`, so both helpers have always been publicly callable; the
  # misleading markers are dropped and the visibility is left unchanged.

  # Replaces slash characters in +filename+ with the token _SLASH_.
  def self.clean_path(filename)
    filename.gsub(/\//, '_SLASH_')
  end

  # Checks that the file name is safe and in the expected format.
  # Raises BadPathError if it contains a slash or is not of the form
  # name.ext.
  def self.sanity_check(filename)
    if filename =~ /\//
      raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }"
    end
    if filename !~ /.+\..+/
      raise FileCache::BadPathError, "Filename '#{filename}' must have name and extension: name.ext"
    end
  end

  # Finds the path that a particular file would have in the cache. The
  # directory is built from up to the last five characters of the name
  # (everything before the last dot), one directory level per character.
  # Raises BadPathError for invalid file names.
  def self.path(filename)
    sanity_check(filename)

    name = filename.match(/(.+)\.(.+)/)[1]
    dirs = name.scan(/./).last(5).join('/')

    File.join(Rbbt.cachedir, dirs, filename)
  end

  # Adds a file to the cache, creating the directories it needs. Raises
  # FileExistsError if the file is already cached, unless the option
  # :force (or 'force') is set. Returns nil.
  def self.add_file(filename, content, options = {})
    # path validates the file name, so no separate sanity_check is needed.
    target = path(filename)
    FileUtils.makedirs(File.dirname(target), :mode => 0777)

    if File.exist?(target) && !(options[:force] || options['force'])
      raise FileCache::FileExistsError, "File #{filename} already in cache"
    end

    File.open(target, 'w') { |f| f.write(content) }
    FileUtils.chmod 0666, target

    nil
  end

  # Removes the file from the cache if it is there. Returns nil.
  def self.del_file(filename)
    target = path(filename)
    FileUtils.rm target if File.exist? target

    nil
  end

end
@@ -1,47 +0,0 @@
1
- require 'rbbt/util/open'
2
- require 'rbbt/util/arrayHash'
3
-
4
module Index

  # Creates an inverse index. Reads +lexicon+ through Open.to_hash (the
  # options, including the separators +:sep+ and +:sep2+, are passed along)
  # and returns a hash in which every identifier found in a row points to
  # the code that keys that row. Each code also points to itself, taking
  # precedence over any identifier with the same text.
  #
  # Options:
  # :case_sensitive:: defaults to true. When false, identifiers are stored
  #                   downcased and the returned hash downcases the keys
  #                   given to +[]+ and +values_at+.
  # :clean::          when set, the data is passed through ArrayHash.clean
  #                   before indexing.
  #
  # Rows with a nil or empty code are skipped, and so are nil or empty
  # identifiers, so that an empty lookup never resolves to a code.
  def self.index(lexicon, options = {})
    options = {:sep => "\t", :sep2 => '\|', :case_sensitive => true}.merge(options)

    data = Open.to_hash(lexicon, options)
    data = ArrayHash.clean(data) if options[:clean]

    index = {}

    # First pass: every identifier points to the code of its row.
    data.each { |code, id_lists|
      next if code.nil? || code == ""
      id_lists.flatten.compact.uniq.each { |id|
        next if id == "" # empty fields are not identifiers
        id = id.downcase unless options[:case_sensitive]
        index[id] = code
      }
    }

    # Second pass: codes point to themselves and win over identifiers.
    data.each { |code, id_lists|
      next if code.nil? || code == ""
      id = code
      id = id.downcase unless options[:case_sensitive]
      index[id] = code
    }

    if !options[:case_sensitive]
      # Patch only this hash so lookups ignore the case of the key.
      class << index; self; end.instance_eval {
        alias_method :old_get, :[]
        define_method(:[], proc { |key| old_get(key.to_s.downcase) })

        alias_method :old_values_at, :values_at
        define_method(:values_at, proc { |*keys| old_values_at(*keys.collect { |key| key.to_s.downcase }) })
      }
    end

    index
  end
end
@@ -1,106 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
-
4
class String
  # Entries read, one per line, from the +wordlists/consonants+ file under
  # +Rbbt.datadir+; left empty when that file does not exist. is_special?
  # uses it as a whitelist of consonant runs.
  CONSONANTS = []
  consonants_file = File.join(Rbbt.datadir, 'wordlists/consonants')
  # File.exist? rather than File.exists?, which was removed in Ruby 3.2.
  if File.exist?(consonants_file)
    Object::Open.read(consonants_file).each_line { |l| CONSONANTS << l.chomp }
  end

  # Uses heuristics to check if a string seems like a special word, like a
  # gene name. Returns true or false.
  def is_special?
    # Only consonants
    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

    # Not a word
    return false if self =~ /[^\s]\s[^\s]/
    return false if self.length < 3
    # Alphanumeric
    return true if self =~ /[0-9]/ && self =~ /[a-z]/i
    # All Caps
    return true if self =~ /[A-Z]{2,}/
    # Caps Mix
    return true if self =~ /[a-z][A-Z]/
    # All consonants
    # NOTE(review): this pattern only matches single-letter strings, which
    # the length check above has already rejected, so the branch never
    # fires. Left as is: widening it to /^[a-z]+$/i would change results.
    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
    # Dashed word
    return true if self =~ /(^\w-|-\w$)/
    # Too many consonants (very heuristic): a run of three or more
    # characters that are not vowels or "y", unless the first such run is
    # listed in CONSONANTS.
    if self =~ /([^aeiouy]{3,})/i && !CONSONANTS.include?($1.downcase)
      return true
    end

    return false
  end

  # Returns a copy of the string with its first character in lowercase.
  # The rest of the string, including any newlines, is kept untouched.
  def downcase_first
    return "" if self == ""
    self[0, 1].downcase + self[1..-1]
  end

  # Turns a roman number into arabic form if possible. Only the simple
  # numerals I, II, III, IV, V and X are known; returns nil for anything
  # else. The whole string must be the numeral.
  def arabic
    case self
    when /\AI\z/   then 1
    when /\AII\z/  then 2
    when /\AIII\z/ then 3
    when /\AIV\z/  then 4
    when /\AV\z/   then 5
    when /\AX\z/   then 10
    else nil
    end
  end
end
57
-
58
-
59
-
60
-
61
# Maps the names of the Greek letters to a latin abbreviation.
$greek = {
  "alpha"   => "a",
  "beta"    => "b",
  "gamma"   => "g",
  "delta"   => "d",
  "epsilon" => "e",
  "zeta"    => "z",
  "eta"     => "e",
  "theta"   => "th",
  "iota"    => "i",
  "kappa"   => "k",
  "lambda"  => "l",
  "mu"      => "m",
  "nu"      => "n",
  "xi"      => "x",
  "omicron" => "o",
  "pi"      => "p",
  "rho"     => "r",
  "sigma"   => "s",
  "tau"     => "t",
  "upsilon" => "u",
  "phi"     => "ph",
  "chi"     => "ch",
  "psi"     => "ps",
  "omega"   => "o"
}

# Maps each abbreviation back to a letter name. Two abbreviations are
# shared by two letters; the letter listed last in $greek wins, so "e"
# resolves to "eta" and "o" to "omega".
$inverse_greek = $greek.invert
90
-
91
# Words (as matched by \w+) read from the +wordlists/stopwords+ file under
# +Rbbt.datadir+. The global is left unset when the file does not exist.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
stopwords_file = File.join(Rbbt.datadir, 'wordlists/stopwords')
$stopwords = Open.read(stopwords_file).scan(/\w+/) if File.exist?(stopwords_file)
92
-
93
class Array

  # Divides the array into +num+ chunks of about the same size by placing
  # one element in each chunk iteratively (element i goes to chunk
  # i % num). Arrays shorter than +num+ produce fewer chunks.
  #
  # This method shadows Enumerable#chunk. When called without +num+ the
  # call is handed over to the standard implementation, so the block form
  # (+array.chunk { |e| ... }+) keeps working.
  def chunk(num = nil, &block)
    return super(&block) if num.nil?

    chunks = []
    each_with_index { |e, i|
      (chunks[i % num] ||= []) << e
    }
    chunks
  end
end