rbbt 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/LICENSE +20 -0
  2. data/README.rdoc +17 -0
  3. data/bin/rbbt_config +180 -0
  4. data/install_scripts/classifier/R/classify.R +36 -0
  5. data/install_scripts/classifier/Rakefile +140 -0
  6. data/install_scripts/get_abner.sh +2 -0
  7. data/install_scripts/get_banner.sh +25 -0
  8. data/install_scripts/get_biocreative.sh +72 -0
  9. data/install_scripts/get_crf++.sh +26 -0
  10. data/install_scripts/get_entrez.sh +4 -0
  11. data/install_scripts/get_go.sh +4 -0
  12. data/install_scripts/get_polysearch.sh +8 -0
  13. data/install_scripts/ner/Rakefile +206 -0
  14. data/install_scripts/ner/config/default.rb +52 -0
  15. data/install_scripts/norm/Rakefile +218 -0
  16. data/install_scripts/norm/config/cue_default.rb +10 -0
  17. data/install_scripts/norm/config/tokens_default.rb +79 -0
  18. data/install_scripts/norm/functions.sh +21 -0
  19. data/install_scripts/organisms/Rakefile +25 -0
  20. data/install_scripts/organisms/cgd.Rakefile +84 -0
  21. data/install_scripts/organisms/human.Rakefile +145 -0
  22. data/install_scripts/organisms/mgi.Rakefile +77 -0
  23. data/install_scripts/organisms/pombe.Rakefile +40 -0
  24. data/install_scripts/organisms/rake-include.rb +258 -0
  25. data/install_scripts/organisms/rgd.Rakefile +88 -0
  26. data/install_scripts/organisms/sgd.Rakefile +66 -0
  27. data/install_scripts/organisms/tair.Rakefile +54 -0
  28. data/install_scripts/organisms/worm.Rakefile +109 -0
  29. data/install_scripts/stopwords +1 -0
  30. data/install_scripts/wordlists/consonants +897 -0
  31. data/install_scripts/wordlists/stopwords +1 -0
  32. data/lib/rbbt/bow/bow.rb +87 -0
  33. data/lib/rbbt/bow/classifier.rb +118 -0
  34. data/lib/rbbt/bow/dictionary.rb +218 -0
  35. data/lib/rbbt/ner/abner.rb +34 -0
  36. data/lib/rbbt/ner/banner.rb +73 -0
  37. data/lib/rbbt/ner/regexpNER.rb +62 -0
  38. data/lib/rbbt/ner/rner.rb +227 -0
  39. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  40. data/lib/rbbt/ner/rnorm/tokens.rb +213 -0
  41. data/lib/rbbt/ner/rnorm.rb +142 -0
  42. data/lib/rbbt/sources/biocreative.rb +75 -0
  43. data/lib/rbbt/sources/biomart.rb +106 -0
  44. data/lib/rbbt/sources/entrez.rb +211 -0
  45. data/lib/rbbt/sources/go.rb +40 -0
  46. data/lib/rbbt/sources/organism.rb +197 -0
  47. data/lib/rbbt/sources/polysearch.rb +88 -0
  48. data/lib/rbbt/sources/pubmed.rb +111 -0
  49. data/lib/rbbt/util/arrayHash.rb +255 -0
  50. data/lib/rbbt/util/filecache.rb +72 -0
  51. data/lib/rbbt/util/index.rb +69 -0
  52. data/lib/rbbt/util/misc.rb +101 -0
  53. data/lib/rbbt/util/open.rb +207 -0
  54. data/lib/rbbt/util/simpleDSL.rb +87 -0
  55. data/lib/rbbt/util/tmpfile.rb +19 -0
  56. data/lib/rbbt/version.rb +10 -0
  57. data/lib/rbbt.rb +86 -0
  58. data/tasks/install.rake +123 -0
  59. metadata +114 -0
@@ -0,0 +1,88 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/ner/regexpNER'
4
+
5
+ # Find terms in the Polysearch thesauri using simple regular expression
6
+ # matching. Note that the first time the methods are used the corresponding
7
+ # thesaurus is loaded into memory. The available thesauri are: disease, drug,
8
+ # metabolite, organ, subcellular (subcellular localization) and tissue.
9
module Polysearch

  # Cache of code => name tables, one per thesaurus, loaded on first use.
  @@names = {}
  def self.type_names(type) #:nodoc:
    type = type.to_s
    @@names[type] ||= Open.to_hash(thesaurus_file(type), :single => true)
  end

  # Cache of RegExpNER matchers, one per thesaurus, built on first use.
  @@indexes = {}
  def self.type_index(type) #:nodoc:
    type = type.to_s
    @@indexes[type] ||= RegExpNER.new(thesaurus_file(type))
  end

  # Find matches in a string of text. +types+ specifies which thesauri to
  # use: a single name or an array of names, given as strings or symbols.
  # If it is nil all the thesauri are used.
  #
  # Returns a single hash with the merged results of +match_hash+ for every
  # thesaurus. Thesauri are processed in alphabetical order, so when two of
  # them produce the same key the later one wins.
  def self.match(text, types = nil)
    if types.nil?
      types = %w(disease drug metabolite organ subcellular tissue)
    end

    types = [types] unless Array === types
    # Normalise to strings before sorting so symbols and strings can be mixed.
    types = types.collect { |type| type.to_s }.sort

    matches = {}
    types.each { |type|
      matches.merge!(type_index(type).match_hash(text))
    }

    matches
  end

  # Transform the code into a name, +type+ is the thesaurus to use.
  #
  # Note that this replaces the zero-argument Module#name for Polysearch.
  def self.name(type, code)
    type_names(type)[code]
  end

  # Path of the data file backing the thesaurus +type+.
  def self.thesaurus_file(type) #:nodoc:
    File.join(Rbbt.datadir, 'dbs', 'polysearch', type.to_s + '.txt')
  end

end
47
+
48
# Demo: when this file is run directly, print the disease terms that
# Polysearch finds in a sample abstract.
if $0 == __FILE__

  sample = <<-EOT

Background Microorganisms adapt their transcriptome by integrating
multiple chemical and physical signals from their environment. Shake-flask
cultivation does not allow precise manipulation of individual culture
parameters and therefore precludes a quantitative analysis of the
(combinatorial) influence of these parameters on transcriptional
regulation. Steady-state chemostat cultures, which do enable accurate
control, measurement and manipulation of individual cultivation parameters
(e.g. specific growth rate, temperature, identity of the growth-limiting
nutrient) appear to provide a promising experimental platform for such a
combinatorial analysis. Results A microarray compendium of 170
steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
presented and analyzed. The 170 microarrays encompass 55 unique
conditions, which can be characterized by the combined settings of 10
different cultivation parameters. By applying a regression model to assess
the impact of (combinations of) cultivation parameters on the
transcriptome, most S. cerevisiae genes were shown to be influenced by
multiple cultivation parameters, and in many cases by combinatorial
effects of cultivation parameters. The inclusion of these combinatorial
effects in the regression model led to higher explained variance of the
gene expression patterns and resulted in higher function enrichment in
subsequent analysis. We further demonstrate the usefulness of the
compendium and regression analysis for interpretation of shake-flask-based
transcriptome studies and for guiding functional analysis of
(uncharacterized) genes and pathways. Conclusions Modeling the
combinatorial effects of environmental parameters on the transcriptome is
crucial for understanding transcriptional regulation. Chemostat
cultivation offers a powerful tool for such an approach. Keywords:
chemostat steady state samples
Cerebellar
stroke syndrome


  EOT

  hits = Polysearch.match(sample, 'disease')
  p hits.values.flatten

end
@@ -0,0 +1,111 @@
1
+ require 'rbbt/util/filecache'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt'
4
+
5
+ # This module offers an interface with PubMed, to perform queries, and
6
+ # retrieve simple information from articles. It uses the caching
7
+ # services of Rbbt.
8
module PubMed

  # Time of the last request sent to the server and the minimum number of
  # seconds left between two consecutive requests.
  @@last = Time.now
  @@pubmed_lag = 1

  # Downloads the XML for one or several PubMed ids, waiting if needed so
  # that requests are at least @@pubmed_lag seconds apart.
  #
  # If +pmids+ is an Array it returns a hash from the PMID found in each
  # article (as a String) to the XML of that article; otherwise it returns
  # the XML of the first article in the response, or nil if there is none.
  #
  # The original file placed this method under a bare +private+, which has no
  # effect on methods defined with +def self.+; it is left callable from
  # outside the module, as before.
  #
  # NOTE(review): the URL uses plain http; NCBI may require https. Whether
  # Open.read handles https or redirects cannot be told from here -- confirm.
  def self.get_online(pmids)

    pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"

    diff = Time.now - @@last
    sleep @@pubmed_lag - diff unless diff > @@pubmed_lag

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    # Only the /m (dot matches newline) option is wanted here. The /s option
    # selects the Windows-31J encoding in Ruby >= 1.9 and makes the match
    # raise on UTF-8 text containing non-ASCII characters.
    articles = xml.scan(/<PubmedArticle>.*?<\/PubmedArticle>/m)

    return articles.first unless pmids.is_a? Array

    list = {}
    articles.each{|article|
      # The PMID tag may carry attributes, e.g. <PMID Version="1">.
      pmid = article[/<PMID(?:\s[^>]*)?>(.*?)<\/PMID>/, 1]
      list[pmid] = article
    }
    list
  end

  # Processes the xml of an article as served by MedLine and extracts
  # the abstract, title and journal information. Each attribute is nil when
  # the corresponding element is not found.
  class Article
    attr_reader :title, :abstract, :journal

    # +xml+ is the XML of a single article; nil is treated as an empty
    # document.
    def initialize(xml)
      xml ||= ""

      # Abstracts may be split into several AbstractText elements, possibly
      # with attributes (Label, NlmCategory...). The text of all of them is
      # joined with newlines.
      sections  = xml.scan(/<AbstractText(?:\s[^>]*)?>(.*?)<\/AbstractText>/m).flatten
      @abstract = sections.empty? ? nil : sections.join("\n")
      @title    = xml[/<ArticleTitle(?:\s[^>]*)?>(.*?)<\/ArticleTitle>/m, 1]
      @journal  = xml[/<Title(?:\s[^>]*)?>(.*?)<\/Title>/m, 1]
    end

    # Join the text from title and abstract
    def text
      [@title, @abstract].join("\n")
    end
  end

  # Returns the Article object containing the information for the PubMed
  # ID specified as an argument. If +pmid+ is an array instead of a single
  # identifier it returns a hash with the Article object for each id, keyed
  # by the ids exactly as they were given. Ids that are neither cached nor
  # present in the server response are absent from the hash.
  # It uses the Rbbt cache to save the articles xml.
  def self.get_article(pmid)

    unless pmid.is_a? Array
      filename = pmid.to_s + '.xml'
      cached   = FileCache.path(filename)

      return Article.new(Open.read(cached)) if File.exist? cached

      xml = get_online(pmid)
      FileCache.add_file(filename, xml)
      return Article.new(xml)
    end

    missing = []
    list = {}

    pmid.each{|p|
      cached = FileCache.path(p.to_s + '.xml')
      if File.exist? cached
        list[p] = Article.new(Open.read(cached))
      else
        missing << p
      end
    }

    return list unless missing.any?

    # get_online keys its result by the String found in the XML; map those
    # back to the ids as requested so that all keys of the result agree.
    requested = {}
    missing.each{|p| requested[p.to_s] = p }

    get_online(missing).each{|p, xml|
      next if p.nil? # article without a recognisable PMID
      FileCache.add_file(p + '.xml', xml, :force => true)
      list[requested.fetch(p, p)] = Article.new(xml)
    }

    list
  end

  # Performs the specified query and returns an array with the PubMed
  # Ids returned. +retmax+ can be used to limit the number of ids
  # returned, if is not specified 30000 is used.
  #
  # Runs of whitespace in +query+ are replaced by '+' so that the URL stays
  # valid; any other escaping is left to the caller, as before.
  def self.query(query, retmax=nil)
    retmax ||= 30000
    term = query.to_s.strip.gsub(/\s+/, '+')

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{term}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end
end
@@ -0,0 +1,255 @@
1
+
2
# Helpers to work with hashes of arrays, where each array holds the fields of
# an entry and each field is either a string of "|" separated values or an
# array of values. Instances wrap one such hash together with the names of
# its key (+main+) and of its fields.
class ArrayHash

  # Take two strings of elements separated by the character sep_char and join them
  # into one, removing empty elements and repetitions. nil counts as an
  # empty string.
  def self.merge_values_string(list1, list2, sep_char ='|')
    elements = list1.to_s.split(sep_char) + list2.to_s.split(sep_char)
    elements.reject{|e| e.to_s == ""}.uniq.join(sep_char)
  end

  # Merge two lists of elements. Elements could be strings of elements
  # separated by the character sep_char, or arrays of such strings. If
  # either argument is a String both are merged as strings; otherwise the
  # arrays are merged position by position, following the length of list1.
  # A nil list is treated as a list of empty strings.
  def self.merge_values(list1, list2, sep_char = "|")
    if String === list1 || String === list2
      # sep_char must be forwarded, otherwise custom separators are ignored.
      return merge_values_string(list1, list2, sep_char)
    end

    return [] if list1.nil? && list2.nil?

    list1 = [''] * list2.length if list1.nil?
    list2 = [''] * list1.length if list2.nil?

    merged = []
    list1.each_with_index{|elem, i|
      merged << merge_values_string(elem, list2[i], sep_char)
    }
    merged
  end


  # Take an hash of arrays and a position and use the value at that position
  # of the arrays and build a new hash with that value as key, and the original
  # key prepended to the arrays. The options hash accepts the following keys
  # :case_insensitive, which defaults to true, and :index, which indicates that
  # the original key should be the value of the hash entry, instead of the
  # complete array of values.
  #
  # Entries whose value at +pos+ is empty are skipped; values holding several
  # "|" separated codes produce one entry per code. With :case_insensitive
  # the new keys are downcased and the [] method of the result downcases the
  # key it is given.
  def self.pullout(hash, pos, options = {})
    index = options[:index]; index = false if index.nil?
    case_insensitive = options[:case_insensitive]; case_insensitive = true if case_insensitive.nil?

    pulled = {}
    hash.each{|key,values|
      code = values[pos].to_s
      next if code == ""

      if index
        list = key
      else
        list = [key] + values
        list.delete_at(pos + 1) # drop the field that becomes the key
      end

      code.split("|").each{|c|
        c = c.downcase if case_insensitive
        pulled[c] = merge_values(pulled[c], list)
      }
    }

    case_insensitive_lookup(pulled) if case_insensitive

    pulled
  end

  # Merge two hashes of arrays. Each hash contains a number of fields for each
  # entry. The pos1 and pos2 indicate what fields should be used to match
  # entries, the values for pos1 and pos2 can be an integer indicating the
  # position in the array or the symbol :main to refer to the key of the hash.
  #
  # The result is keyed like hash1 and holds the fields of hash1 followed by
  # the fields of hash2 (when pos2 is an integer the matching field of hash2
  # is replaced by its key, placed first). Entries of hash1 without a match
  # are padded with empty strings. Neither input hash is modified.
  #
  # The options hash accepts the key :case_insensitive. When pos1 is an
  # integer the options are handed to pullout, where it defaults to true.
  # When pos1 is :main, keys are only downcased if it is given explicitly.
  def self.merge(hash1, hash2, pos1 = :main, pos2 = :main, options = {})

    fold_case = options[:case_insensitive]

    index1 = nil
    if pos1.to_s.downcase != 'main'
      index1 = pullout(hash1, pos1, options.merge(:index => true))
    elsif fold_case
      folded = {}
      hash1.each{|k,v|
        folded[k.to_s.downcase] = v
      }
      hash1 = case_insensitive_lookup(folded)
    end

    # Empty hashes have no first row to measure.
    length1 = (hash1.values.first || []).length
    length2 = (hash2.values.first || []).length

    merged = {}
    hash2.each{|key, values|
      if pos2.to_s.downcase == 'main'
        k = key
        v = values
      elsif Integer === pos2
        k = values[pos2]
        v = values.dup # do not alter the rows of hash2
        v.delete_at(pos2)
        v.unshift(key)
      else
        raise "Format of second index not understood"
      end

      code = (index1.nil? ? k : index1[k])
      next unless code

      code.split('|').each{|c|
        # Codes coming from index1 are the original keys of hash1 and must
        # be left untouched to be found there.
        c = c.to_s.downcase if fold_case && index1.nil?
        merged[c] = (hash1[c] || [''] * length1) + v
      }
    }

    hash1.each{|key, values|
      merged[key] ||= values + [''] * length2
    }

    merged
  end

  # For a given hash of arrays, filter the position pos of each array with the
  # block of code. The block receives each of the "|" separated values of
  # the field and its results are joined back with "|". Returns a new hash;
  # the arrays of the original are not modified.
  def self.process(hash, pos, &block)
    processed = {}
    hash.each{|key, values|
      row = values.dup
      row[pos] = row[pos].to_s.split("|").collect{|n| block.call(n)}.join("|")
      processed[key] = row
    }
    processed
  end

  # Clean structure for repeated values. If the same value appears more than
  # once it is kept only where it appears at the lowest position of the
  # values list (columns of the ArrayHash are assumed to be sorted for
  # importance); if the position is the same, it is kept for the key with the
  # smallest value after turning it into integer, and the first key seen wins
  # a tie.
  #
  # Fields can be strings of "|" separated values or arrays of values. The
  # option :case_sensitive, which defaults to true, can be set to false to
  # compare the values ignoring case.
  def self.clean(hash, options = {})
    case_sensitive = options[:case_sensitive]; case_sensitive = true if case_sensitive.nil?
    normalize = lambda{|v| case_sensitive ? v : v.downcase }

    # First pass: for every value, find the [key, position] that keeps it.
    found = {}
    hash.each{|k, list|
      list.each_with_index{|values,i|
        field_values(values).each{|v|
          id = normalize.call(v)
          if found[id].nil?
            found[id] = [k,i]
          else
            last_k, last_i = found[id]
            if last_i > i || (last_i == i && last_k.to_i > k.to_i)
              found[id] = [k,i]
            end
          end
        }
      }
    }

    # Second pass: rebuild every field with the values it is allowed to keep.
    new_hash = {}
    hash.each{|k,list|
      new_list = []
      list.each_with_index{|values,i|
        kept = field_values(values).select{|v| found[normalize.call(v)] == [k,i] }
        new_list << case values
                    when String then kept.join("|")
                    when Array  then kept
                    else values
                    end
      }
      new_hash[k] = new_list
    }
    new_hash
  end

  # Makes the [] method of +hash+ downcase the key before looking it up. The
  # original method is kept as old_get. Returns the hash.
  def self.case_insensitive_lookup(hash) #:nodoc:
    singleton = class << hash; self; end
    singleton.instance_eval{
      alias_method :old_get, :[]
      define_method(:[], proc{|key| old_get(key.to_s.downcase)})
    }
    hash
  end
  private_class_method :case_insensitive_lookup

  # List of values held in a field: strings are split on "|", anything else
  # is turned into an array (nil becomes an empty list).
  def self.field_values(values) #:nodoc:
    String === values ? values.split("|") : Array(values)
  end
  private_class_method :field_values

  attr_reader :main, :fields, :data

  # +hash+ is the hash of arrays, +main+ the name of its keys and +fields+
  # the names of the positions of the arrays. If +fields+ is nil the names
  # F0, F1... are used, one for each element of the first array.
  def initialize(hash, main, fields = nil)
    @data = hash
    @main = main.to_s

    if fields.nil?
      first = hash.values.first
      fields = (0...(first ? first.length : 0)).collect{|i| "F#{i}"}
    end

    @fields = fields.collect{|f| f.to_s}
  end

  # Applies ArrayHash.process to the field named +field+ and keeps the
  # result as the new data. Returns self.
  def process(field, &block)
    pos = self.field_pos(field)
    @data = ArrayHash.process(self.data, pos, &block)
    self
  end

  # Returns the position of a given field in the value arrays, :main if it is
  # the main field, or nil if there is no such field. Names are compared
  # ignoring case.
  def field_pos(field)
    return :main if field == :main

    name = field.to_s.downcase
    return :main if name == self.main.downcase

    @fields.collect{|f| f.downcase}.index(name)
  end


  # Merge two ArrayHashes using the specified field. The data and fields of
  # +other+ are added to this object, +other+ is left untouched. Returns
  # self.
  def merge(other, field = :main, options = {} )
    field = self.main if field == :main

    pos1 = self.field_pos(field)
    pos2 = other.field_pos(field)

    @data = ArrayHash.merge(self.data, other.data, pos1, pos2, options)

    # Work on a copy, other.fields is the live array of the other object.
    new_fields = other.fields.dup
    unless pos2 == :main
      new_fields.delete_at(pos2)
      new_fields.unshift(other.main)
    end
    @fields += new_fields
    self
  end

  # Remove a field from the ArrayHash, modifying the arrays in place. Returns
  # self, or nil if the field does not exist.
  def remove(field)
    pos = self.field_pos(field)
    return if pos.nil?
    @data = self.data.each{|key,values| values.delete_at(pos)}
    @fields.delete_at(pos)
    self
  end

  # Replaces the data with the result of ArrayHash.clean. Returns self.
  def clean
    @data = ArrayHash.clean(@data)
    self
  end
end
252
+
253
+
254
+
255
+
@@ -0,0 +1,72 @@
1
+ require 'fileutils'
2
+ require 'rbbt'
3
+
4
+ # Provides caching functionality for files downloaded from the internet
5
module FileCache

  class BadPathError < StandardError; end
  class FileExistsError < StandardError; end

  # Replace every slash in +filename+ with the text _SLASH_.
  def self.clean_path(filename)
    filename.gsub(%r{/}, '_SLASH_')
  end

  # Raise BadPathError unless +filename+ is free of slashes and has the form
  # name.ext.
  def self.sanity_check(filename)
    raise FileCache::BadPathError, "Character / not allowed in name: #{ filename }" if filename =~ %r{/}
    raise FileCache::BadPathError, "Filename must have name and extension: name.ext" unless filename =~ /.+\..+/
  end

  # Location of +filename+ inside the cache: under Rbbt.cachedir, nested in
  # one directory for each of the last five characters of the name (without
  # the extension).
  def self.path(filename)
    sanity_check(filename)

    name = filename.match(/(.+)\.(.+)/)[1]
    subdirs = name.scan(/./).last(5).join('/')

    File.join(Rbbt.cachedir, subdirs, filename)
  end

  # Store +content+ in the cache as +filename+. Raises FileExistsError if the
  # file is already there, unless the option :force (or 'force') is set.
  # Directories are created with mode 0777 and the file is left with mode
  # 0666. Returns nil.
  def self.add_file(filename, content, options = {})
    sanity_check(filename)

    target = path(filename)
    FileUtils.makedirs(File.dirname(target), :mode => 0777)

    forced = options[:force] || options['force']
    if File.exist?(target) && !forced
      raise FileCache::FileExistsError, "File #{filename} already in cache"
    end

    File.open(target, 'w') { |file| file.write(content) }
    FileUtils.chmod 0666, target

    nil
  end

  # Delete +filename+ from the cache if it is there. Returns nil.
  def self.del_file(filename)
    sanity_check(filename)

    target = path(filename)
    FileUtils.rm target if File.exist? target

    nil
  end

end
@@ -0,0 +1,69 @@
1
+ require 'rbbt/util/open'
2
+ require 'rbbt/util/arrayHash'
3
+
4
module Index

  # Builds an inverse index from the file +lexicon+. The file is read with
  # Open.to_hash, which receives the options merged over the defaults
  # :sep => "\t|\\|" and :case_sensitive => true. In the resulting hash
  # every identifier listed for a code points to that code, and every code
  # points to itself. Entries with a nil or empty code are ignored.
  #
  # With :clean the data goes through ArrayHash.clean first. When
  # :case_sensitive is false the identifiers are downcased and the [] method
  # of the index downcases the key it receives.
  def self.index(lexicon, options = {})
    options = {:sep => "\t|\\|", :case_sensitive => true}.merge(options)

    data = Open.to_hash(lexicon, options)
    data = ArrayHash.clean(data) if options[:clean]

    fold_case = !options[:case_sensitive]
    as_key = lambda { |id| fold_case ? id.downcase : id }

    index = {}

    # Identifiers first...
    data.each do |code, id_lists|
      next if code.nil? || code == ""
      id_lists.flatten.compact.uniq.each { |id| index[as_key.call(id)] = code }
    end

    # ...then the codes themselves, so that a code always points to itself.
    data.each do |code, id_lists|
      next if code.nil? || code == ""
      index[as_key.call(code)] = code
    end

    if fold_case
      class << index
        alias_method :old_get, :[]
        define_method(:[]) { |key| old_get(key.to_s.downcase) }
      end
    end

    index
  end
end
45
+
46
# Benchmark: when this file is run directly, time the construction of a
# case insensitive index and 100 rounds of lookups for a list of ids.
#
# The lexicon and the file with the ids (one per line) can be given as the
# first and second command line arguments; the paths used originally remain
# as defaults.
if __FILE__ == $0

  require 'benchmark'

  lexicon  = ARGV[0] || '/home/miki/rbbt/data/organisms/human/identifiers'
  ids_file = ARGV[1] || '/home/miki/git/MARQ/test/GDS1375_malignant_vs_normal_up.genes'

  normal = nil
  puts "Normal " + Benchmark.measure{
    normal = Index.index(lexicon, :trie => false, :case_sensitive => false)
  }.to_s


  # String#collect does not exist in Ruby >= 1.9, iterate over the lines
  # explicitly.
  ids = Open.read(ids_file).each_line.collect{|l| l.chomp.strip.upcase}

  new = nil

  puts ids.inspect
  puts "normal " + Benchmark.measure{
    100.times{
      new = ids.collect{|id| normal[id]}
    }
  }.to_s

  puts new.inspect

end
@@ -0,0 +1,101 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+
4
# Entries of the wordlists/consonants data file, one per line and without
# repetitions. String#is_special? compares runs of consonants against this
# list. String#collect does not exist in Ruby >= 1.9, so the lines are
# iterated explicitly.
$consonants = Open.read(File.join(Rbbt.datadir, 'wordlists/consonants')).each_line.collect{|l| l.chomp}.uniq
5
class String
  # Uses heuristics to check if a string seems like a special word, like a
  # gene name. The checks are tried in order and the first one that applies
  # decides the result. The last check relies on the global $consonants list.
  def is_special?
    # Only consonants
    return true if self =~ /^[bcdfghjklmnpqrstvwxz]+$/i

    # Not a word: contains inner whitespace or is too short
    return false if self =~ /[^\s]\s[^\s]/
    return false if self.length < 3
    # Alphanumeric
    return true if self =~ /[0-9]/ && self =~ /[a-z]/i
    # All Caps (two or more consecutive capital letters)
    return true if self =~ /[A-Z]{2,}/
    # Caps Mix
    return true if self =~ /[a-z][A-Z]/
    # All consonants. NOTE(review): this pattern only matches a line made of
    # a single letter, which the length check above already rules out for
    # ordinary words; it is kept as it was to preserve behaviour.
    return true if self =~ /^[a-z]$/i && self !~ /[aeiou]/i
    # Dashed word
    return true if self =~ /(^\w-|-\w$)/
    # Too many consonants (very heuristic): the first run of three or more
    # characters that are not vowels is not in the list of known runs
    if self =~ /([^aeiouy]{3,})/i && !$consonants.include?($1.downcase)
      return true
    end

    false
  end

  # Returns a copy of the string with the first character in lowercase. The
  # rest of the string, including any newlines, is left as it is.
  def downcase_first
    return "" if self == ""
    self[0, 1].downcase + self[1..-1]
  end

  # Turns a roman number into arabic form if possible. Just simple
  # romans only: I, II, III, IV, V and X. Returns nil for anything else.
  def arabic
    case self
    when /^I$/   then 1
    when /^II$/  then 2
    when /^III$/ then 3
    when /^IV$/  then 4
    when /^V$/   then 5
    when /^X$/   then 10
    else nil
    end
  end
end
53
+
54
+
55
+
56
+
57
# Latin abbreviation used for the name of each Greek letter.
$greek = {
  "alpha" => "a",   "beta" => "b",     "gamma" => "g",  "delta" => "d",
  "epsilon" => "e", "zeta" => "z",     "eta" => "e",    "theta" => "th",
  "iota" => "i",    "kappa" => "k",    "lambda" => "l", "mu" => "m",
  "nu" => "n",      "xi" => "x",       "omicron" => "o", "pi" => "p",
  "rho" => "r",     "sigma" => "s",    "tau" => "t",    "upsilon" => "u",
  "phi" => "ph",    "chi" => "ch",     "psi" => "ps",   "omega" => "o"
}

# Abbreviation => letter name. Where two letters share an abbreviation the
# one listed last in $greek is kept ("e" => "eta", "o" => "omega").
$inverse_greek = $greek.invert
86
# Every run of word characters found in the wordlists/stopwords data file.
stopwords_file = File.join(Rbbt.datadir, 'wordlists/stopwords')
$stopwords = Open.read(stopwords_file).scan(/\w+/)
87
+
88
class Array

  # Deals the elements of the array into at most +num+ chunks, round-robin:
  # the element at position i goes to chunk i % num. Chunks that would be
  # empty are not created. Returns the array of chunks.
  def chunk(num)
    buckets = []
    each_with_index do |element, position|
      (buckets[position % num] ||= []) << element
    end
    buckets
  end
end