rbbt 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,88 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/ner/regexpNER'
4
+
5
+ # Find terms in the Polysearch thesauri using simple regular expression
6
+ # matching. Note that the first time the methods are used the corresponding
7
+ # thesauri are loaded into memory. The available thesauri are: disease, drug,
8
+ # metabolite, organ, subcellular (subcellular localization) and tissue.
9
module Polysearch

  # Per-thesaurus tables loaded by type_names, keyed by thesaurus name
  # (always a String).
  @@names = {}

  # Loads (once) and returns the table for the thesaurus +type+, read with
  # Open.to_hash from <datadir>/dbs/polysearch/<type>.txt. +type+ may be a
  # String or a Symbol.
  def self.type_names(type) #:nodoc:
    type = type.to_s
    @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'), :single => true)
  end

  # Per-thesaurus RegExpNER matchers built by type_index, keyed by thesaurus
  # name (always a String).
  @@indexes = {}

  # Builds (once) and returns the RegExpNER matcher for the thesaurus +type+,
  # created from the same file that type_names reads. +type+ may be a String
  # or a Symbol.
  def self.type_index(type) #:nodoc:
    type = type.to_s
    @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type + '.txt'))
  end

  # Find matches in a string of text. +types+ specifies which thesauri to
  # use: a single name or an array of names (Strings or Symbols); if it is
  # nil all the thesauri are used. Returns the hashes produced by
  # RegExpNER#match_hash for each thesaurus merged into one hash; thesauri
  # are processed in alphabetical order and later ones overwrite equal keys.
  def self.match(text, types = nil)
    types = %w(disease drug metabolite organ subcellular tissue) if types.nil?
    types = [types] unless Array === types

    # Normalise to Strings first so that Symbols can be sorted together with
    # Strings and share the same cache entries.
    types = types.collect{|type| type.to_s}.sort

    matches = {}
    types.each{|type|
      matches.merge!(type_index(type).match_hash(text))
    }

    matches
  end

  # Transform the code into a name, +type+ is the thesaurus to use.
  def self.name(type, code)
    type_names(type)[code]
  end

end
47
+
48
if __FILE__ == $0

  # Manual smoke test, run only when this file is executed directly: match a
  # sample abstract against the 'disease' thesaurus and print what was found.
  sample_abstract =<<-EOT

Background Microorganisms adapt their transcriptome by integrating
multiple chemical and physical signals from their environment. Shake-flask
cultivation does not allow precise manipulation of individual culture
parameters and therefore precludes a quantitative analysis of the
(combinatorial) influence of these parameters on transcriptional
regulation. Steady-state chemostat cultures, which do enable accurate
control, measurement and manipulation of individual cultivation parameters
(e.g. specific growth rate, temperature, identity of the growth-limiting
nutrient) appear to provide a promising experimental platform for such a
combinatorial analysis. Results A microarray compendium of 170
steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
presented and analyzed. The 170 microarrays encompass 55 unique
conditions, which can be characterized by the combined settings of 10
different cultivation parameters. By applying a regression model to assess
the impact of (combinations of) cultivation parameters on the
transcriptome, most S. cerevisiae genes were shown to be influenced by
multiple cultivation parameters, and in many cases by combinatorial
effects of cultivation parameters. The inclusion of these combinatorial
effects in the regression model led to higher explained variance of the
gene expression patterns and resulted in higher function enrichment in
subsequent analysis. We further demonstrate the usefulness of the
compendium and regression analysis for interpretation of shake-flask-based
transcriptome studies and for guiding functional analysis of
(uncharacterized) genes and pathways. Conclusions Modeling the
combinatorial effects of environmental parameters on the transcriptome is
crucial for understanding transcriptional regulation. Chemostat
cultivation offers a powerful tool for such an approach. Keywords:
chemostat steady state samples
Cerebellar
stroke syndrome


  EOT

  disease_matches = Polysearch.match(sample_abstract, 'disease')
  p disease_matches.values.flatten

end
@@ -0,0 +1,111 @@
1
+ require 'rbbt/util/filecache'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt'
4
+
5
+ # This module offers an interface with PubMed, to perform queries, and
6
+ # retrieve simple information from articles. It uses the caching
7
+ # services of Rbbt.
8
module PubMed

  # Time of the last request sent to PubMed and the minimum number of seconds
  # to leave between two requests.
  @@last = Time.now
  @@pubmed_lag = 1

  # Downloads the XML for the given PubMed id(s) from the efetch service,
  # waiting if needed so that requests are at least @@pubmed_lag seconds
  # apart. When +pmids+ is an Array it returns a hash from PMID (the String
  # found in the XML) to that article's XML; otherwise it returns the XML of
  # the first article found, or nil when there is none.
  #
  # The method is reachable from outside the module: a bare +private+ marker
  # does not apply to methods defined with <tt>def self.</tt>, so the marker
  # the module used to carry has been dropped.
  def self.get_online(pmids)

    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"

    diff = Time.now - @@last
    sleep @@pubmed_lag - diff unless diff > @@pubmed_lag

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    # Only the /m flag is used: in Ruby it makes '.' match newlines, while
    # /s selects the Windows-31J encoding and makes the match raise on UTF-8
    # text that contains non-ASCII characters.
    articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/m).flatten

    if pmids.is_a? Array
      list = {}
      articles.each{|article|
        # [^>]* tolerates attributes on the tag, e.g. <PMID Version="1">.
        pmid = article.scan(/<PMID[^>]*>(.*?)<\/PMID>/).flatten.first
        list[pmid] = article
      }
      return list
    else
      return articles.first
    end

  end

  # Processes the XML of an article as served by MedLine and extracts the
  # abstract, title and journal information. Each attribute is nil when the
  # corresponding tag is not found.
  class Article
    attr_reader :title, :abstract, :journal

    # +xml+ is the XML of a single article; nil is treated as an empty
    # document.
    def initialize(xml)
      xml ||= ""
      # The captures are greedy: with several <AbstractText> sections the
      # abstract spans from the first opening tag to the last closing tag.
      @abstract = $1 if xml.match(/<AbstractText[^>]*>(.*)<\/AbstractText>/m)
      @title    = $1 if xml.match(/<ArticleTitle>(.*)<\/ArticleTitle>/m)
      @journal  = $1 if xml.match(/<Title>(.*)<\/Title>/m)
    end

    # Join the text from title and abstract
    def text
      [@title, @abstract].join("\n")
    end
  end

  # Returns the Article object containing the information for the PubMed
  # ID specified as an argument. If +pmid+ is an array instead of a single
  # identifier it returns a hash with the Article object for each id, keyed
  # by the ids exactly as they were given (ids that PubMed returns but that
  # were not requested are keyed by their PMID string).
  # It uses the Rbbt cache to save the articles xml.
  def self.get_article(pmid)

    if pmid.is_a? Array
      missing = []
      list = {}

      pmid.each{|p|
        filename = p.to_s + '.xml'
        if File.exist? FileCache.path(filename)
          list[p] = Article.new(Open.read(FileCache.path(filename)))
        else
          missing << p
        end
      }

      return list unless missing.any?

      # get_online keys its result by the PMID string read from the XML; map
      # those back to the caller's ids so cached and downloaded entries use
      # the same kind of key.
      requested = {}
      missing.each{|p| requested[p.to_s] = p}

      get_online(missing).each{|found, xml|
        next if found.nil? # article without a recognisable <PMID> tag
        FileCache.add_file(found + '.xml', xml, :force => true)
        key = requested.key?(found) ? requested[found] : found
        list[key] = Article.new(xml)
      }

      return list

    else
      filename = pmid.to_s + '.xml'

      if File.exist? FileCache.path(filename)
        return Article.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(pmid)
        FileCache.add_file(filename,xml)

        return Article.new(xml)
      end
    end
  end

  # Performs the specified query and returns an array with the PubMed
  # Ids returned. +retmax+ can be used to limit the number of ids
  # returned, if is not specified 30000 is used.
  #
  # NOTE(review): +query+ is interpolated into the URL as it is; callers
  # presumably pass a term that is already URL-escaped -- confirm.
  def self.query(query, retmax=nil)
    retmax ||= 30000

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end
end
@@ -0,0 +1,255 @@
1
+
2
# Utilities to work with hashes whose values are arrays of fields, where each
# field is a string holding one or more elements separated by '|'. The class
# methods work on plain hashes; an ArrayHash instance wraps such a hash
# together with the name of its key column (+main+) and of its +fields+.
class ArrayHash

  # Redefines +[]+ on this particular hash so that the requested key is
  # turned into a lowercase string before the lookup. The original lookup
  # stays available as +old_get+. Returns the same hash.
  def self.ignore_key_case(hash) #:nodoc:
    class << hash; self; end.instance_eval{
      alias_method :old_get, :[]
      define_method(:[], proc{|key| old_get(key.to_s.downcase)})
    }
    hash
  end
  private_class_method :ignore_key_case

  # Take two strings of elements separated by the character sep_char and join them
  # into one, removing repetitions and empty elements.
  def self.merge_values_string(list1, list2, sep_char ='|')
    elem1 = list1.to_s.split(sep_char)
    elem2 = list2.to_s.split(sep_char)
    (elem1 + elem2).select{|e| e.to_s != ""}.uniq.join(sep_char)
  end

  # Merge two lists of elements. Elements could be strings of elements
  # separated by the character sep_char, or arrays of such strings. When
  # either argument is a String both are merged as strings; otherwise the
  # arrays are merged position by position, following the length of list1,
  # and a nil list counts as an array of empty strings.
  def self.merge_values(list1, list2, sep_char = "|")
    if String === list1 || String === list2
      # sep_char has to be forwarded here as well, otherwise custom
      # separators would be ignored for plain strings.
      return merge_values_string(list1, list2, sep_char)
    end

    list1 = [''] * list2.length if list1.nil?
    list2 = [''] * list1.length if list2.nil?

    merged = []
    list1.each_with_index{|elem, i|
      merged << merge_values_string(elem, list2[i], sep_char)
    }
    merged
  end


  # Take an hash of arrays and a position and use the value at that position
  # of the arrays and build a new hash with that value as key, and the original
  # key prepended to the arrays. The options hash accepts the following keys
  # :case_insensitive, which defaults to true, and :index, which indicates that
  # the original key should be the value of the hash entry, instead of the
  # complete array of values. Entries whose value at +pos+ is empty are
  # skipped, and values holding several '|' separated codes produce one entry
  # per code.
  def self.pullout(hash, pos, options = {})
    index = options[:index]; index = false if index.nil?
    case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?

    pulled = {}
    hash.each{|key,values|
      code = values[pos].to_s
      next if code == ""

      if index
        list = key
      else
        # pos + 1 because the key has just been prepended.
        list = [key] + values
        list.delete_at(pos + 1)
      end

      code.split("|").each{|c|
        c = c.downcase if case_insensitive
        pulled[c] = merge_values(pulled[c], list)
      }
    }

    ignore_key_case(pulled) if case_insensitive

    pulled
  end

  # Merge two hashes of arrays. Each hash contains a number of fields for each
  # entry. The pos1 and pos2 indicate what fields should be used to match
  # entries, the values for pos1 and pos2 can be an integer indicating the
  # position in the array or the symbol :main to refer to the key of the hash.
  # Neither hash1 nor hash2, nor their arrays, are modified.
  #
  # The options hash accepts the key :case_insensitive. It is passed on to
  # pullout (where it defaults to true) when pos1 is a position; the keys of
  # the hashes themselves are only matched case insensitively when the
  # option is given explicitly.
  #
  # NOTE(review): with a positional pos1 and :case_insensitive set, the
  # codes taken from the index are downcased before being looked up in
  # hash1, whose keys keep their case -- confirm that this is intended.
  def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})

    index1 = nil
    if pos1.to_s.downcase != 'main'
      index1 = pullout(hash1, pos1, options.merge(:index => true))
    elsif options[:case_insensitive]
      lowered = {}
      hash1.each{|k,v|
        lowered[k.to_s.downcase] = v
      }
      hash1 = ignore_key_case(lowered)
    end

    # An empty hash contributes rows with no fields instead of failing.
    length1 = (hash1.values.first || []).length
    length2 = (hash2.values.first || []).length

    merged = {}
    hash2.each{|key, values|
      case
      when pos2.to_s.downcase == 'main'
        k = key
        v = values
      when Integer === pos2
        k = values[pos2]
        # Work on a copy so that the rows of hash2 are left untouched.
        v = values.dup
        v.delete_at(pos2)
        v.unshift(key)
      else
        raise "Format of second index not understood"
      end

      code = (index1.nil? ? k : index1[k])
      if code
        code.split('|').each{|c|
          c = c.to_s.downcase if options[:case_insensitive]
          merged[c] = hash1[c] || [''] * length1
          merged[c] += v
        }
      end
    }

    # Entries of hash1 without a match are padded with empty fields.
    hash1.each{|key, values|
      merged[key] ||= values + [''] * length2
    }

    merged
  end

  # For a given hash of arrays, filter the position pos of each array with the
  # block of code. The block is called for every '|' separated element. The
  # arrays are changed in place and also returned in a new hash.
  def self.process(hash, pos, &block)
    processed = {}
    hash.each{|key, values|
      values[pos] = values[pos].to_s.split("|").collect{|n| block.call(n)}.join("|")
      processed[key] = values
    }
    processed
  end

  # Clean structure for repeated values. If the same value appears more than
  # once only one occurrence is kept: the one found at the earliest position
  # of the values list (columns of the ArrayHash are assumed to be sorted by
  # importance) or, if they appear on the same position, the one whose key is
  # smaller after turning it into integer. Only fields that are Strings are
  # rewritten, other fields are copied as they are.
  #
  # NOTE(review): values are downcased before being compared when
  # options[:case_sensitive] is set, which reads as inverted -- confirm the
  # intended meaning of the option before relying on it.
  def self.clean(hash, options = {})
    case_sensitive = options[:case_sensitive]

    # value => [key, position] of the occurrence that is kept
    found = {}

    hash.each{|k, list|
      list.each_with_index{|values,i|
        (String === values ? values.split("|") : values).each{|v|
          v = v.downcase if case_sensitive
          if found[v].nil?
            found[v] = [k,i]
          else
            last_k, last_i = found[v].values_at(0,1)
            if last_i > i || (last_i == i && last_k.to_i > k.to_i)
              found[v] = [k,i]
            end
          end
        }
      }
    }

    new_hash = {}
    hash.each{|k,list|
      new_list = []
      list.each_with_index{|values,i|
        new_values = []
        (String === values ? values.split("|") : values).each{|v|
          found_k, found_i = found[(case_sensitive ? v.downcase : v )].values_at(0,1)
          if found_i == i && found_k == k
            new_values << v
          end
        }
        new_list << (String === values ? new_values.join("|") : values)
      }
      new_hash[k] = new_list
    }
    new_hash
  end

  attr_reader :main, :fields, :data

  # +hash+ is the hash of arrays to wrap, +main+ the name of the key column
  # and +fields+ the names of the positions of the arrays. When +fields+ is
  # nil the names F0, F1, ... are used, one for each position of the first
  # entry (none if the hash is empty).
  def initialize(hash, main, fields = nil)
    @data = hash
    @main = main.to_s

    if fields.nil?
      l = (hash.values.first || []).length
      fields = []
      l.times{|i| fields << "F#{i}"}
    end

    @fields = fields.collect{|f| f.to_s}
  end

  # Wrapper for ArrayHash.process using the name of the field instead of its
  # position. Returns self.
  def process(field, &block)
    pos = self.field_pos(field)
    @data = ArrayHash.process(self.data, pos, &block)
    self
  end

  # Returns the position of a given field in the value arrays, :main if the
  # field is :main or the name of the key column, and nil if there is no
  # such field. Names are compared ignoring case and may be Strings or
  # Symbols.
  def field_pos(field)
    return :main if field == :main
    if field.to_s.downcase == self.main.downcase
      return :main
    else
      @fields.collect{|f| f.downcase}.index(field.to_s.downcase)
    end
  end


  # Merge two ArrayHashes using the specified field. The data and fields of
  # +other+ are appended to this ArrayHash; +other+ is left untouched.
  # Returns self.
  def merge(other, field = :main, options = {} )
    field = self.main if field == :main

    pos1 = self.field_pos(field)
    pos2 = other.field_pos(field)

    @data = ArrayHash.merge(self.data, other.data, pos1, pos2, options)

    # Copy the field names before rearranging them: other.fields returns the
    # array that +other+ itself holds.
    new_fields = other.fields.dup
    if pos2 != :main
      new_fields.delete_at(pos2)
      new_fields.unshift(other.main)
    end
    @fields += new_fields
    self
  end

  # Remove a field from the ArrayHash. Returns self, or nil if the field
  # does not exist.
  def remove(field)
    pos = self.field_pos(field)
    return if pos.nil?
    @data = self.data.each{|key,values| values.delete_at(pos)}
    @fields.delete_at(pos)
    self
  end

  # Wrapper for ArrayHash.clean with the default options. Returns self.
  def clean
    @data = ArrayHash.clean(@data)
    self
  end
end
252
+
253
+
254
+
255
+
@@ -0,0 +1,72 @@
1
+ require 'fileutils'
2
+ require 'rbbt'
3
+
4
+ # Provides caching functionality for files downloaded from the internet
5
module FileCache

  class BadPathError < StandardError; end
  class FileExistsError < StandardError; end

  # Replace every slash in +filename+ with the text _SLASH_.
  #
  # This helper and sanity_check are defined with <tt>def self.</tt>, so
  # they can be called from outside the module.
  def self.clean_path(filename)
    filename.gsub('/', '_SLASH_')
  end

  # Raise BadPathError unless +filename+ has no slashes and looks like
  # name.ext.
  def self.sanity_check(filename)
    raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }" if filename =~ /\//
    raise FileCache::BadPathError, "Filename must have name and extension: name.ext" unless filename =~ /.+\..+/
  end

  # Find the path that a particular file would have in the cache: inside
  # Rbbt.cachedir, under one nested directory for each of the (up to) five
  # last characters of the name without its extension.
  def self.path(filename)
    sanity_check(filename)

    name = filename.match(/(.+)\.(.+)/)[1]
    dirs = name.scan(/./).last(5).join('/')

    File.join(File.join(Rbbt.cachedir, dirs), filename)
  end

  # Write +content+ into the cache as +filename+, creating the directories
  # needed. Raises FileExistsError if the file is already there, unless the
  # option :force (or 'force') is set. Returns nil.
  def self.add_file(filename, content, options = {})
    sanity_check(filename)

    target = path(filename)
    FileUtils.makedirs(File.dirname(target), :mode => 0777)

    overwrite = options[:force] || options['force']
    if File.exist?(target) && !overwrite
      raise FileCache::FileExistsError, "File #{filename} already in cache"
    end

    File.open(target, 'w'){|file| file.write(content)}
    FileUtils.chmod 0666, target

    nil
  end

  # Delete +filename+ from the cache if it is there. Returns nil.
  def self.del_file(filename)
    sanity_check(filename)

    target = path(filename)
    FileUtils.rm target if File.exist? target

    nil
  end

end
@@ -0,0 +1,69 @@
1
+ require 'rbbt/util/open'
2
+ require 'rbbt/util/arrayHash'
3
+
4
module Index

  # Builds an inverse index from +lexicon+, a file that is read with
  # Open.to_hash. In the hash returned every identifier found in the values
  # of an entry points to the key of that entry, and every key points to
  # itself (keys take precedence over identifiers). Entries with an empty
  # key are ignored.
  #
  # The options are handed over to Open.to_hash after adding the defaults
  # :sep => "\t|\\|" and :case_sensitive => true. With :clean the data goes
  # through ArrayHash.clean first. When :case_sensitive is false the entries
  # are stored in lowercase and the hash downcases the keys it is asked for.
  def self.index(lexicon, options = {})
    options = {:sep => "\t|\\|", :case_sensitive => true}.merge(options)

    data = Open.to_hash(lexicon, options)
    data = ArrayHash.clean(data) if options[:clean]

    fold_case = !options[:case_sensitive]
    entry_for = proc{|term| fold_case ? term.downcase : term}

    index = {}

    # First the identifiers ...
    data.each{|code, id_lists|
      next if code.nil? || code == ""
      id_lists.flatten.compact.uniq.each{|id|
        index[entry_for.call(id)] = code
      }
    }

    # ... and then the codes themselves, so that they win over any
    # identifier that is spelled the same.
    data.each_key{|code|
      next if code.nil? || code == ""
      index[entry_for.call(code)] = code
    }

    if fold_case
      class << index; self; end.instance_eval{
        alias_method :old_get, :[]
        define_method(:[], proc{|key| old_get(key.to_s.downcase)})
      }
    end

    index
  end
end
45
+
46
if __FILE__ == $0

  # Manual benchmark, run only when this file is executed directly. It uses
  # files from the author's machine: it times the construction of a case
  # insensitive index and then 100 rounds of lookups for a list of ids.
  require 'benchmark'

  normal = nil
  puts "Normal " + Benchmark.measure{
    normal = Index.index('/home/miki/rbbt/data/organisms/human/identifiers',:trie => false, :case_sensitive => false)
  }.to_s


  # Iterate the lines explicitly: the text read is a String, and String
  # stopped being Enumerable (no collect) in Ruby 1.9.
  ids = Open.read('/home/miki/git/MARQ/test/GDS1375_malignant_vs_normal_up.genes').each_line.collect{|l| l.chomp.strip.upcase}

  new = nil

  puts ids.inspect
  puts "normal " + Benchmark.measure{
    100.times{
      new = ids.collect{|id| normal[id]}
    }
  }.to_s

  puts new.inspect

end
@@ -0,0 +1,101 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+
4
# Entries of the wordlists/consonants data file, one per line, without
# repetitions. String#is_special? checks consonant runs against this list.
# The lines are iterated explicitly because String stopped being Enumerable
# (no collect) in Ruby 1.9.
$consonants = Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line.collect{|l| l.chomp}.uniq
5
class String
  # Uses heuristics to check if a string seems like a special word, like a
  # gene name. The checks are tried in order and the first one that applies
  # decides the result. The last check reads the global $consonants.
  def is_special?
    # Only consonants
    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

    # Not a word
    return false if self =~ /[^\s]\s[^\s]/;
    return false if self.length < 3;
    # Alphanumeric
    return true if self =~ /[0-9]/ && self =~ /[a-z]/i
    # All Caps
    return true if self =~ /[A-Z]{2,}/;
    # Caps Mix
    return true if self =~ /[a-z][A-Z]/;
    # All consonants
    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
    # Dashed word
    return true if self =~ /(^\w-|-\w$)/
    # Too many consonants (very heuristic): the first run of three or more
    # characters that are not vowels or 'y' is not in the list of known runs
    if self =~ /([^aeiouy]{3,})/i && !$consonants.include?($1.downcase)
      return true
    end

    return false
  end

  # Returns a copy of the string with its first character in lowercase. The
  # rest of the string, newlines included, is left as it is.
  def downcase_first
    return "" if self == ""
    self[0, 1].downcase + self[1..-1]
  end

  # Turns a roman number into arabic form if possible. Just simple
  # romans only: I, II, III, IV, V and X. Returns nil for anything else;
  # the whole string has to be the numeral.
  def arabic
    case self
    when 'I'   then 1
    when 'II'  then 2
    when 'III' then 3
    when 'IV'  then 4
    when 'V'   then 5
    when 'X'   then 10
    else nil
    end
  end
end
53
+
54
+
55
+
56
+
57
# Names of the Greek letters and the Latin letters used to write them.
$greek = {
  "alpha"   => "a",  "beta"    => "b",  "gamma"   => "g",  "delta"   => "d",
  "epsilon" => "e",  "zeta"    => "z",  "eta"     => "e",  "theta"   => "th",
  "iota"    => "i",  "kappa"   => "k",  "lambda"  => "l",  "mu"      => "m",
  "nu"      => "n",  "xi"      => "x",  "omicron" => "o",  "pi"      => "p",
  "rho"     => "r",  "sigma"   => "s",  "tau"     => "t",  "upsilon" => "u",
  "phi"     => "ph", "chi"     => "ch", "psi"     => "ps", "omega"   => "o"
}

# The reverse table, from Latin letters to the name of the Greek letter.
# Several names share a Latin letter ("e", "o"); the name that comes last in
# $greek is the one that is kept.
$inverse_greek = $greek.invert
86
# Every run of word characters found in the wordlists/stopwords data file.
stopwords_file = File.join(Rbbt.datadir, 'wordlists/stopwords')
$stopwords = Open.read(stopwords_file).scan(/\w+/)
87
+
88
class Array

  # Deals the elements of the array into +num+ chunks, round-robin: the
  # element at position i goes to chunk i % num. Returns the array of chunks;
  # chunks that receive no element are not created, so a short array gives
  # fewer than +num+ chunks.
  #
  # Note that this takes the place of Enumerable#chunk for arrays.
  def chunk(num)
    buckets = []
    each_index{|position|
      slot = position % num
      buckets[slot] = [] if buckets[slot].nil?
      buckets[slot].push(self[position])
    }
    buckets
  end
end