rbbt-text 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
1
+ require 'rbbt'
2
+ require 'rbbt/bow/misc'
3
+ require 'stemmer'
4
+
5
+ # This module provides methods to extract a bag of words (or bag of bigrams)
6
+ # representation for strings of text, and to produce a vector representations
7
+ # of that bag of words for a given list of terms. This BOW representations of
8
+ # the texts is usually first used to build a Dictionary, and then, with the
9
+ # best selection of terms as determined by the Dictionary::TF_IDF.best of
10
+ # Dictionary::KL.best methods, determine the vector representations for that
11
+ # text.
12
# This module provides methods to extract a bag-of-words (or bag-of-bigrams)
# representation for strings of text, and to produce vector representations
# of that bag of words for a given list of terms. The BOW representation of
# the texts is usually first used to build a Dictionary, and then, with the
# best selection of terms as determined by the Dictionary::TF_IDF.best or
# Dictionary::KL.best methods, determine the vector representations for that
# text.
module BagOfWords
  # Divide the input string into an array of words (sequences of \w
  # characters). Words are downcased and stemmed (String#stem from the
  # 'stemmer' gem), then filtered to remove stopwords, words of 2 characters
  # or fewer, and words with no lowercase letter. The stopword list is the
  # global $stopwords, populated by 'rbbt/bow/misc'.
  #
  # Returns [] for nil input; raises if the stopword list was never loaded.
  def self.words(text)
    return [] if text.nil?
    raise "Stopword list not loaded. Have you installed the wordlists? (rbbt_config prepare wordlists)" if $stopwords.nil?

    text.scan(/\w+/).
      collect{|word| word.downcase.stem}.
      select{|word|
        ! $stopwords.include?(word) &&
        word.length > 2 &&
        word =~ /[a-z]/
      }
  end

  # Take the array of words for the text and form all consecutive-word
  # bigrams. Returns the words plus the bigrams (each bigram as
  # "word1 word2").
  def self.bigrams(text)
    words = words(text)
    bigrams = []
    lastword = nil

    words.each{|word|
      bigrams << "#{lastword} #{word}" if lastword
      lastword = word
    }

    words + bigrams
  end

  # Given an array of terms, return a hash with the number of appearances of
  # each term. The hash has a default value of 0 for unseen terms.
  def self.count(terms)
    count = Hash.new(0)
    terms.each{|word| count[word] += 1}
    count
  end

  # Given a string of text, find all the words (or words plus bigrams, when
  # the bigrams flag is true) and return a hash with their counts.
  def self.terms(text, bigrams = true)
    if bigrams
      count(bigrams(text))
    else
      count(words(text))
    end
  end

  # Given a string of text and a list of terms, which may or may not contain
  # bigrams, return an array with one entry per term holding the number of
  # occurrences of that term in the text. When the bigrams argument is nil it
  # is inferred: bigrams are extracted iff any term contains a space.
  def self.features(text, terms, bigrams = nil)
    bigrams ||= terms.select{|term| term =~ / /}.any?
    count = bigrams ? count(bigrams(text)) : count(words(text))
    count.values_at(*terms)
  end
end
74
+
75
# Convenience shortcuts so any string responds to the BagOfWords
# tokenization methods directly.
class String
  # Shortcut for BagOfWords.words(self)
  def words
    BagOfWords.words(self)
  end

  # Shortcut for BagOfWords.bigrams(self)
  def bigrams
    BagOfWords.bigrams(self)
  end
end
86
+
87
+
@@ -0,0 +1,187 @@
1
# Simple accumulating dictionary: sums the occurrence counts of each term
# across all the documents added to it.
class Dictionary
  # Hash of term => total count, with a default of 0 for unseen terms.
  attr_reader :terms

  def initialize
    @terms = Hash.new(0)
  end

  # Merge a document, given as a {term => count} hash, into the accumulated
  # counts. (The block parameter is accepted for interface compatibility but
  # is unused.)
  def add(terms, &block)
    terms.each{|term, count|
      @terms[term] += count
    }
  end
end
13
+
14
class Dictionary
  # Dictionary that tracks term counts, document counts, and corpus totals so
  # that document frequency (df), term frequency (tf), inverse document
  # frequency (idf) and tf-idf statistics can be computed over the documents
  # added to it.
  class TF_IDF
    attr_reader :terms, :docs, :total_terms, :num_docs

    # Options:
    #   :limit -- maximum number of distinct terms to track (default 500_000).
    #             Once exceeded, terms not already known are ignored.
    def initialize(options = {})
      @term_limit = {
        :limit => 500_000,
      }.merge(options)[:limit]

      @terms = Hash.new(0)
      @docs = Hash.new(0)
      @num_docs = 0
      @total_terms = 0
    end

    # Add one document, given as a {term => count} hash.
    def add(terms)
      if @term_limit && @terms.length > @term_limit
        # BUGFIX: use the non-mutating reject instead of delete_if, which
        # destructively removed entries from the caller's hash.
        terms = terms.reject{|term, count| !@terms.include?(term) }
      end

      terms.each{|term, count|
        @terms[term] += count
        @total_terms += count
        @docs[term] += 1
      }
      @num_docs += 1
    end

    # Document frequency: fraction of documents containing each term.
    def df
      df = Hash.new(0)
      @docs.each{|term, count|
        df[term] = count.to_f / @num_docs
      }
      df
    end

    # Term frequency: fraction of all term occurrences belonging to each term.
    def tf
      tf = Hash.new(0)
      @terms.each{|term, count|
        tf[term] = count.to_f / @total_terms
      }
      tf
    end

    # Inverse document frequency: log(num_docs / docs containing the term).
    def idf
      idf = Hash.new(0)
      num_docs = @num_docs.to_f
      @docs.each{|term, count|
        idf[term] = Math::log(num_docs / count)
      }
      idf
    end

    # tf * idf for every known term.
    def tf_idf
      tf_idf = Hash.new(0)
      num_docs = @num_docs.to_f
      @docs.each{|term, count|
        tf_idf[term] = @terms[term].to_f / @total_terms * Math::log(num_docs / count)
      }
      tf_idf
    end

    # Score the terms whose document frequency lies in [low, hi] and return a
    # {term => score} hash, truncated to the :limit highest-scoring terms when
    # :limit is given. The score is (count / num_docs) * log(1 / df).
    def best(options = {})
      hi, low, limit = {
        :low => 0,
        :hi => 1,
      }.merge(options).
        values_at(:hi, :low, :limit)

      num_docs = @num_docs.to_f
      best = df.select{|term, value|
        value >= low && value <= hi
      }.collect{|p|
        term = p.first
        df_value = p.last
        [term,
          @terms[term].to_f / num_docs * Math::log(1.0 / df_value)
        ]
      }
      if limit
        Hash[*best.sort{|a, b| b[1] <=> a[1]}.slice(0, limit).flatten]
      else
        Hash[*best.flatten]
      end
    end

    # idf weights for the terms selected by best(options).
    def weights(options = {})
      best_terms = best(options).keys
      weights = {}

      num_docs = @num_docs.to_f
      best_terms.each{|term|
        weights[term] = Math::log(num_docs / @docs[term])
      }
      weights
    end
  end
end
112
+
113
class Dictionary
  # Dictionary that scores terms by the symmetrised Kullback-Leibler
  # divergence between their document frequencies in a positive and a
  # negative corpus, each tracked by a Dictionary::TF_IDF.
  class KL
    attr_reader :pos_dict, :neg_dict

    def initialize(options = {})
      @pos_dict = Dictionary::TF_IDF.new(options)
      @neg_dict = Dictionary::TF_IDF.new(options)
    end

    # Union of the terms seen in either corpus.
    def terms
      (pos_dict.terms.keys + neg_dict.terms.keys).uniq
    end

    # Add a document's {term => count} hash to the positive corpus when c is
    # :+ or '+', otherwise to the negative corpus.
    def add(terms, c)
      dict = (c == :+ || c == '+' ? @pos_dict : @neg_dict)
      dict.add(terms)
    end

    # Symmetrised KL divergence for every known term.
    def kl
      kl = {}
      pos_df = @pos_dict.df
      neg_df = @neg_dict.df

      terms.each{|term|
        kl[term] = divergence(pos_df[term], neg_df[term])
      }
      kl
    end

    # Score the terms whose positive or negative document frequency lies in
    # [low, hi] and return a {term => divergence} hash, truncated to the
    # :limit highest-scoring terms when :limit is given.
    def best(options = {})
      hi, low, limit = {
        :low => 0,
        :hi => 1,
      }.merge(options).
        values_at(:hi, :low, :limit)

      pos_df = @pos_dict.df
      neg_df = @neg_dict.df

      best = {}
      terms.select{|term|
        pos_df[term] >= low && pos_df[term] <= hi ||
        neg_df[term] >= low && neg_df[term] <= hi
      }.each{|term|
        best[term] = divergence(pos_df[term], neg_df[term])
      }
      if limit
        Hash[*best.sort{|a, b| b[1] <=> a[1]}.slice(0, limit).flatten]
      else
        Hash[*best.flatten]
      end
    end

    # Term weights are just the KL scores from best(options).
    def weights(options = {})
      best(options)
    end

    private

    # Clamp a document frequency away from 0 and 1 so the logs stay finite.
    def smooth(value)
      return 0.000001 if value == 0
      return 0.999999 if value == 1
      value
    end

    # pos * log(pos/neg) + neg * log(neg/pos), with smoothing applied.
    def divergence(pos, neg)
      pos = smooth(pos)
      neg = smooth(neg)
      pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
    end
  end
end
@@ -0,0 +1,7 @@
1
require 'rbbt'
require 'rbbt/util/open'

Rbbt.add_datafiles 'stopwords' => ['wordlists', 'stopwords']

# Populate the global $stopwords list (used by BagOfWords and RegExpNER)
# from the installed 'stopwords' datafile, when present.
# NOTE: File.exist? replaces the deprecated File.exists? (removed in Ruby 3.2).
$stopwords = Open.read(Rbbt.find_datafile 'stopwords').scan(/\w+/) if File.exist?(Rbbt.find_datafile 'stopwords')
7
+
@@ -0,0 +1,61 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/bow/misc'
3
+
4
# Dictionary-based named entity recognition: builds one regular expression
# per entity code from a lexicon of names, and reports which codes match a
# given text.
class RegExpNER

  # Build a single regexp that matches any of the given names as a whole
  # word. Names are matched longest-first, runs of whitespace inside a name
  # match any whitespace (\s+), and nil/empty names are dropped.
  #
  # BUGFIX: the ignorecase parameter was previously accepted but never used;
  # it now controls Regexp::IGNORECASE as its name (and the default of true)
  # implies.
  def self.build_re(names, ignorecase = true)
    res = names.compact.reject{|n| n.empty?}.
      sort_by{|a| a.length}.reverse.collect{|n| Regexp.quote(n) }

    body = res.join("|").gsub(/\\?\s/, '\s+')
    Regexp.new("\\b(#{ body })\\b", ignorecase ? Regexp::IGNORECASE : nil)
  end

  # Build the code => regexp index from a lexicon file (parsed with TSV).
  # Options (see Misc.add_defaults): :flatten, :case_insensitive, :stopwords.
  # Names that are stopwords are excluded; $stopwords is used when
  # :stopwords is nil or true.
  def initialize(lexicon, options = {})
    options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil

    if $stopwords and (options[:stopwords].nil? || options[:stopwords] == true)
      options[:stopwords] = $stopwords
    else
      options[:stopwords] = []
    end

    data = TSV.new(lexicon, options)

    @index = {}
    data.each{|code, names|
      next if code.nil? || code == ""
      if options[:stopwords].any?
        names = names.select{|n|
          ! options[:stopwords].include?(options[:case_insensitive] ? n.downcase : n)
        }
      end
      @index[code] = RegExpNER.build_re(names, options[:case_insensitive])
    }
  end

  # Scan the text with one regexp, or each of an array of regexps, and
  # return all matched substrings in a flat array.
  def self.match_re(text, res)
    res = [res] unless Array === res

    res.collect{|re|
      text.scan(re)
    }.flatten
  end

  # Return a {code => [matched strings]} hash for all lexicon entries found
  # in the text. Codes with no match are absent from the result.
  def match_hash(text)
    return {} if text.nil? or text.empty?
    matches = {}
    @index.each{|code, re|
      RegExpNER.match_re(text, re).each{|match|
        matches[code] ||= []
        matches[code] << match
      }
    }
    matches
  end

  # Alias-style entry point for match_hash.
  def match(text)
    match_hash(text)
  end

end
61
+
@@ -0,0 +1,30 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+ require 'rbbt/bow/bow'
3
+ require 'test/unit'
4
+
5
# Unit tests for BagOfWords (lib/rbbt/bow/bow.rb).
class TestBow < Test::Unit::TestCase

  def test_words
    assert_equal(["hello", "world"], "Hello World".words)
  end

  def test_terms
    text = "Hello World"

    assert_equal(["hello", "world"], BagOfWords.terms(text, false).keys.sort)
    assert_equal(["hello", "hello world", "world"], BagOfWords.terms(text, true).keys.sort)
  end

  def test_features
    text = "Hello world!"
    text += "Hello World Again!"

    assert_equal([2, 2], BagOfWords.features(text, "Hello World".words.uniq.sort))
  end

  # Stemming should reduce plurals to the stem ("Proteins" -> "protein").
  def test_stem
    assert_equal(["protein"], "Proteins".words)
  end
end
29
+
30
+
@@ -0,0 +1,91 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/bow/dictionary'
3
+ require 'rbbt/bow/bow'
4
+ require 'test/unit'
5
+
6
# Unit tests for Dictionary, Dictionary::TF_IDF and Dictionary::KL
# (lib/rbbt/bow/dictionary.rb).
class TestDictionary < Test::Unit::TestCase

  def test_standard
    docs = []
    docs << BagOfWords.terms("Hello World", false)
    docs << BagOfWords.terms("Hello Yin Yin", false)

    dict = Dictionary.new
    docs.each{|doc| dict.add doc}

    assert_equal(2, dict.terms["hello"])
    assert_equal(2, dict.terms["yin"])
    assert_equal(0, dict.terms["bye"])
    assert_equal(1, dict.terms["world"])
  end

  def test_tf_idf
    docs = []
    docs << BagOfWords.terms("Hello World", false)
    docs << BagOfWords.terms("Hello Yin Yin", false)

    dict = Dictionary::TF_IDF.new
    docs.each{|doc| dict.add doc}

    assert_equal(2, dict.terms["hello"])
    assert_equal(2, dict.terms["yin"])
    assert_equal(0, dict.terms["bye"])
    assert_equal(1, dict.terms["world"])

    assert_equal(1, dict.df["hello"])
    assert_equal(0.5, dict.df["yin"])
    assert_equal(0, dict.df["bye"])
    assert_equal(0.5, dict.df["world"])

    assert_equal(2.0/5, dict.tf["hello"])
    assert_equal(2.0/5, dict.tf["yin"])
    assert_equal(0, dict.tf["bye"])
    assert_equal(1.0/5, dict.tf["world"])

    assert_equal(Math::log(1), dict.idf["hello"])
    assert_equal(Math::log(2), dict.idf["yin"])
    assert_equal(0, dict.idf["bye"])
    assert_equal(Math::log(2), dict.idf["world"])

    assert_equal(2.0/5 * Math::log(1), dict.tf_idf["hello"])
    assert_equal(2.0/5 * Math::log(2), dict.tf_idf["yin"])
    assert_equal(0, dict.tf_idf["bye"])
    assert_equal(1.0/5 * Math::log(2), dict.tf_idf["world"])
  end

  def test_best
    docs = []
    docs << BagOfWords.terms("Hello World", false)
    docs << BagOfWords.terms("Hello Yin Yin", false)

    dict = Dictionary::TF_IDF.new
    docs.each{|doc| dict.add doc}

    assert_equal(1, dict.best(:limit => 1).length)
    assert(dict.best(:limit => 1).include? "yin")
  end

  def test_kl
    docs = []
    docs << [BagOfWords.terms("Hello World", false), :+]
    docs << [BagOfWords.terms("Hello Cruel World", false), :+]
    docs << [BagOfWords.terms("Hello Yan Yan", false), :-]
    docs << [BagOfWords.terms("Hello Yin Yin", false), :-]

    dict = Dictionary::KL.new
    docs.each{|doc| dict.add *doc}

    # "hello" appears in every document of both corpora, so its divergence
    # vanishes; "yan"/"yin" are symmetric in the negative corpus.
    assert_equal(0, dict.kl["hello"])
    assert_equal(dict.kl['yan'], dict.kl['yin'])
    assert_in_delta(1 * Math::log(1 / 0.000001), dict.kl["world"], 0.01)
    assert_in_delta(0.5 * Math::log(0.5 / 0.000001), dict.kl["cruel"], 0.01)
  end

end
90
+
91
+
@@ -0,0 +1,9 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+ require 'rbbt/bow/misc'
3
+ require 'test/unit'
4
+
5
# Checks that requiring rbbt/bow/misc populated the $stopwords global
# (i.e. that the wordlists datafiles are installed).
class TestBase < Test::Unit::TestCase
  def test_url
    assert_not_nil($stopwords)
  end
end
@@ -0,0 +1,32 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt-util'
3
+ require 'rbbt/ner/regexpNER'
4
+ require 'test/unit'
5
+
6
# Unit tests for RegExpNER (lib/rbbt/ner/regexpNER.rb) using a small
# comma-separated lexicon written to a temporary file.
class TestRegExpNER < Test::Unit::TestCase

  def test_class
    text = "a bc d e f g h i j k l m n o p q one two"

    lexicon =<<-EOF
C1,a,x,xx,xxx
C2,bc,y,yy,yyy
C3,i,z,zz,zzz,m,one two
    EOF

    file = TmpFile.tmp_file
    File.open(file, 'w'){|f| f.write lexicon}

    # Without stopword filtering every lexicon name present in the text
    # should be reported.
    r = RegExpNER.new(file, :sep => ',', :stopwords => false)
    assert_equal(['a', 'bc', 'i', 'm', 'one two'].sort, r.match_hash(text).values.flatten.sort)

    # With stopword filtering, single-letter stopwords ('a', 'i') drop out.
    r = RegExpNER.new(file, :sep => ',', :stopwords => true)
    assert_equal(['bc', 'm', 'one two'].sort, r.match_hash(text).values.flatten.sort)

    FileUtils.rm file
  end

end
31
+
32
+
@@ -0,0 +1,4 @@
1
require 'test/unit'

# Make lib/ and the test directory itself loadable from any test file.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
4
+
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbbt-text
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 4
10
+ version: 0.0.4
11
+ platform: ruby
12
+ authors:
13
+ - Miguel Vazquez
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-12-01 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rbbt-util
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: "Text mining tools: named entity recognition and normalization, document classification, bag-of-words, dictionaries, etc"
36
+ email: miguel.vazquez@fdi.ucm.es
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - lib/rbbt/bow/bow.rb
45
+ - lib/rbbt/bow/dictionary.rb
46
+ - lib/rbbt/bow/misc.rb
47
+ - lib/rbbt/ner/regexpNER.rb
48
+ - test/rbbt/bow/test_bow.rb
49
+ - test/rbbt/bow/test_dictionary.rb
50
+ - test/rbbt/bow/test_misc.rb
51
+ - test/rbbt/ner/test_regexpNER.rb
52
+ - test/test_helper.rb
53
+ has_rdoc: true
54
+ homepage: http://github.com/mikisvaz/rbbt-util
55
+ licenses: []
56
+
57
+ post_install_message:
58
+ rdoc_options: []
59
+
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ hash: 3
68
+ segments:
69
+ - 0
70
+ version: "0"
71
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ hash: 3
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ requirements: []
81
+
82
+ rubyforge_project:
83
+ rubygems_version: 1.3.7
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
87
+ test_files:
88
+ - test/rbbt/bow/test_bow.rb
89
+ - test/rbbt/bow/test_dictionary.rb
90
+ - test/rbbt/bow/test_misc.rb
91
+ - test/rbbt/ner/test_regexpNER.rb
92
+ - test/test_helper.rb