rbbt-text 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,87 @@
1
+ require 'rbbt'
2
+ require 'rbbt/bow/misc'
3
+ require 'stemmer'
4
+
5
+ # This module provides methods to extract a bag of words (or bag of bigrams)
6
+ # representation for strings of text, and to produce a vector representations
7
+ # of that bag of words for a given list of terms. This BOW representations of
8
+ # the texts is usually first used to build a Dictionary, and then, with the
9
+ # best selection of terms as determined by the Dictionary::TF_IDF.best or
10
+ # Dictionary::KL.best methods, determine the vector representations for that
11
+ # text.
12
# Bag-of-words helpers: tokenize a text into stemmed, stopword-filtered
# words (optionally extended with bigrams) and turn the result into a
# count vector over a fixed list of terms.
module BagOfWords
  # Split +text+ into lowercase, stemmed tokens (runs of \w characters),
  # keeping only tokens that are not stopwords ($stopwords, loaded by
  # 'rbbt/bow/misc'), are at least three characters long, and contain at
  # least one a-z letter. Returns [] for nil input.
  def self.words(text)
    return [] if text.nil?
    raise "Stopword list not loaded. Have you installed the wordlists? (rbbt_config prepare wordlists)" if $stopwords.nil?

    tokens = text.scan(/\w+/).collect { |token| token.downcase.stem }
    tokens.select do |token|
      token.length > 2 &&
        token =~ /[a-z]/ &&
        ! $stopwords.include?(token)
    end
  end

  # The stemmed words of +text+ plus every pair of consecutive words
  # joined with a space, all in one array.
  def self.bigrams(text)
    tokens = words(text)
    pairs = tokens.each_cons(2).collect { |first, second| "#{first} #{second}" }

    tokens + pairs
  end

  # Tally the elements of +terms+ into a hash of term => occurrences
  # (missing terms default to 0).
  def self.count(terms)
    terms.inject(Hash.new(0)) { |tally, term| tally[term] += 1; tally }
  end

  # Term counts for +text+: words only, or words plus bigrams when
  # +bigrams+ is true (the default).
  def self.terms(text, bigrams = true)
    tokens = bigrams ? bigrams(text) : words(text)
    count(tokens)
  end

  # One count per entry of +terms+, in order, for the words (or
  # bigrams) found in +text+. Bigram extraction is switched on
  # automatically when any term contains a space, unless +bigrams+ is
  # given explicitly.
  def self.features(text, terms, bigrams = nil)
    bigrams ||= terms.any? { |term| term =~ / / }
    tally = bigrams ? count(bigrams(text)) : count(words(text))
    tally.values_at(*terms)
  end
end
74
+
75
class String
  # Delegates to BagOfWords.words.
  def words
    BagOfWords.words self
  end

  # Delegates to BagOfWords.bigrams.
  def bigrams
    BagOfWords.bigrams self
  end
end
86
+
87
+
@@ -0,0 +1,187 @@
1
# Accumulates term counts across documents: a plain frequency
# dictionary over every {term => count} hash passed to #add.
class Dictionary
  # Hash of term => accumulated count; unseen terms default to 0.
  attr_reader :terms

  def initialize
    @terms = Hash.new(0)
  end

  # Merge one document's {term => count} hash into the accumulated
  # counts. (The block parameter is unused; Ruby methods accept a block
  # regardless, so the signature is kept for compatibility.)
  def add(terms, &block)
    terms.each do |term, count|
      @terms[term] += count
    end
  end
end
13
+
14
class Dictionary
  # TF-IDF statistics over a stream of documents. Each document is a
  # {term => count} hash (e.g. from BagOfWords.terms).
  class TF_IDF
    attr_reader :terms, :docs, :total_terms, :num_docs

    # Options:
    #   :limit — once more than this many distinct terms have been
    #            seen, terms not already in the dictionary are ignored
    #            (default 500_000).
    def initialize(options = {})
      @term_limit = {
        :limit => 500_000,
      }.merge(options)[:limit]

      @terms = Hash.new(0)
      @docs = Hash.new(0)
      @num_docs = 0
      @total_terms = 0
    end

    # Register one document's {term => count} hash.
    #
    # Fix: the original used Hash#delete_if, silently mutating the
    # caller's hash when the term limit was exceeded; Hash#reject keeps
    # the argument intact.
    def add(terms)
      if @term_limit && @terms.length > @term_limit
        terms = terms.reject { |term, count| ! @terms.include?(term) }
      end

      terms.each do |term, count|
        @terms[term] += count
        @total_terms += count
        @docs[term] += 1 # one hit per document, regardless of count
      end
      @num_docs += 1
    end

    # Document frequency: fraction of documents containing each term.
    def df
      df = Hash.new(0)
      @docs.each do |term, count|
        df[term] = count.to_f / @num_docs
      end
      df
    end

    # Term frequency: fraction of all term occurrences taken by each term.
    def tf
      tf = Hash.new(0)
      @terms.each do |term, count|
        tf[term] = count.to_f / @total_terms
      end
      tf
    end

    # Inverse document frequency: log(num_docs / docs containing term).
    def idf
      idf = Hash.new(0)
      num_docs = @num_docs.to_f
      @docs.each do |term, count|
        idf[term] = Math::log(num_docs / count)
      end
      idf
    end

    # Product of term frequency and inverse document frequency.
    def tf_idf
      tf_idf = Hash.new(0)
      num_docs = @num_docs.to_f
      @docs.each do |term, count|
        tf_idf[term] = @terms[term].to_f / @total_terms * Math::log(num_docs / count)
      end
      tf_idf
    end

    # Score every term whose document frequency falls in [low, hi] by
    # count/num_docs * log(1/df) and return a {term => score} hash,
    # keeping only the top :limit entries when :limit is given.
    def best(options = {})
      hi, low, limit = {
        :low => 0,
        :hi => 1,
      }.merge(options).
        values_at(:hi, :low, :limit)

      num_docs = @num_docs.to_f
      scored = df.select { |term, value|
        value >= low && value <= hi
      }.collect { |p|
        term = p.first
        df_value = p.last
        [term,
          @terms[term].to_f / num_docs * Math::log(1.0 / df_value)
        ]
      }
      if limit
        Hash[*scored.sort { |a, b| b[1] <=> a[1] }.slice(0, limit).flatten]
      else
        Hash[*scored.flatten]
      end
    end

    # IDF weights for the terms selected by #best (same options).
    def weights(options = {})
      best_terms = best(options).keys
      weights = {}

      num_docs = @num_docs.to_f
      best_terms.each do |term|
        weights[term] = Math::log(num_docs / @docs[term])
      end
      weights
    end
  end
end
112
+
113
class Dictionary
  # Symmetrised Kullback-Leibler term scoring between a positive and a
  # negative document class, each tracked by its own TF_IDF dictionary.
  class KL
    attr_reader :pos_dict, :neg_dict

    def initialize(options = {})
      @pos_dict = Dictionary::TF_IDF.new(options)
      @neg_dict = Dictionary::TF_IDF.new(options)
    end

    # All terms seen in either class.
    def terms
      (pos_dict.terms.keys + neg_dict.terms.keys).uniq
    end

    # Add one document's {term => count} hash to class +c+
    # (:+ or '+' selects the positive dictionary, anything else the
    # negative one).
    def add(terms, c)
      dict = (c == :+ || c == '+' ? @pos_dict : @neg_dict)
      dict.add(terms)
    end

    # {term => symmetrised KL contribution} over all terms, computed
    # from the per-class document frequencies.
    def kl
      divergences(@pos_dict.df, @neg_dict.df, terms)
    end

    # Like #kl, but restricted to terms whose document frequency (in
    # either class) lies in [low, hi]; keeps the top :limit entries
    # when :limit is given.
    def best(options = {})
      hi, low, limit = {
        :low => 0,
        :hi => 1,
      }.merge(options).
        values_at(:hi, :low, :limit)

      pos_df = @pos_dict.df
      neg_df = @neg_dict.df

      selected = terms.select { |term|
        pos_df[term] >= low && pos_df[term] <= hi ||
          neg_df[term] >= low && neg_df[term] <= hi
      }
      best = divergences(pos_df, neg_df, selected)
      if limit
        Hash[*best.sort { |a, b| b[1] <=> a[1] }.slice(0, limit).flatten]
      else
        Hash[*best.flatten]
      end
    end

    def weights(options = {})
      best(options)
    end

    private

    # Clamp a document frequency away from 0 and 1 so the logs below
    # stay finite. (Fix: this clamping was duplicated verbatim in #kl
    # and #best.)
    def smooth(freq)
      return 0.000001 if freq == 0
      return 0.999999 if freq == 1
      freq
    end

    # {term => pos*log(pos/neg) + neg*log(neg/pos)} for +term_list+.
    def divergences(pos_df, neg_df, term_list)
      result = {}
      term_list.each do |term|
        pos = smooth(pos_df[term])
        neg = smooth(neg_df[term])

        result[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
      end
      result
    end
  end
end
@@ -0,0 +1,7 @@
1
require 'rbbt'
require 'rbbt/util/open'

# Register the stopword datafile and, when it is available, load it
# into the global $stopwords array (one entry per \w+ token).
Rbbt.add_datafiles 'stopwords' => ['wordlists', 'stopwords']

# Fix: File.exists? is deprecated (removed in Ruby 3.2) — use
# File.exist?; also look the datafile up once and guard against a nil
# path so a missing datafile cannot raise a TypeError.
stopword_file = Rbbt.find_datafile 'stopwords'
$stopwords = Open.read(stopword_file).scan(/\w+/) if stopword_file && File.exist?(stopword_file)
7
+
@@ -0,0 +1,61 @@
1
+ require 'rbbt-util'
2
+ require 'rbbt/bow/misc'
3
+
4
# Dictionary-based named entity recognition: compiles one regular
# expression per lexicon entry and reports every match in a text.
class RegExpNER

  # Build a single alternation regexp matching any of +names+ as a
  # whole word. Longer names are tried first, and whitespace inside a
  # name matches any run of whitespace (\s+).
  #
  # Fix: the +ignorecase+ parameter (default true, wired from the
  # :case_insensitive option in #initialize) was accepted but never
  # applied; it now sets Regexp::IGNORECASE.
  #
  # NOTE(review): an empty +names+ list still yields /\b()\b/, which
  # matches the empty string everywhere — same as the original; confirm
  # callers never pass an all-filtered name list.
  def self.build_re(names, ignorecase = true)
    quoted = names.compact.reject { |name| name.empty? }.
      sort_by { |name| name.length }.reverse.collect { |name| Regexp.quote(name) }

    body = quoted.join("|").gsub(/\\?\s/, '\s+')
    Regexp.new("\\b(#{ body })\\b", ignorecase ? Regexp::IGNORECASE : nil)
  end

  # Load a lexicon (TSV of code => names) and precompile one regexp per
  # code. Options include :sep, :case_insensitive (default true) and
  # :stopwords (default: use the global $stopwords list when loaded).
  def initialize(lexicon, options = {})
    options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil

    if $stopwords and (options[:stopwords].nil? || options[:stopwords] == true)
      options[:stopwords] = $stopwords
    else
      options[:stopwords] = []
    end

    data = TSV.new(lexicon, options)

    @index = {}
    data.each do |code, names|
      next if code.nil? || code == ""
      if options[:stopwords].any?
        names = names.select { |name|
          ! options[:stopwords].include?(options[:case_insensitive] ? name.downcase : name)
        }
      end
      @index[code] = RegExpNER.build_re(names, options[:case_insensitive])
    end
  end

  # Scan +text+ with one regexp (or an array of them) and return the
  # flat list of captured matches.
  def self.match_re(text, res)
    res = [res] unless Array === res

    res.collect { |re| text.scan(re) }.flatten
  end

  # {code => [matches]} for every lexicon entry found in +text+.
  def match_hash(text)
    return {} if text.nil? or text.empty?
    matches = {}
    @index.each do |code, re|
      RegExpNER.match_re(text, re).each do |match|
        (matches[code] ||= []) << match
      end
    end
    matches
  end

  # Shortcut for #match_hash.
  def match(text)
    match_hash(text)
  end
end
61
+
@@ -0,0 +1,30 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+ require 'rbbt/bow/bow'
3
+ require 'test/unit'
4
+
5
# Unit tests for BagOfWords tokenization, term counting and feature
# vectors (requires the stemmer and stopword list to be installed).
class TestBow < Test::Unit::TestCase

  def test_words
    assert_equal(%w[hello world], "Hello World".words)
  end

  def test_terms
    sample = "Hello World"

    assert_equal(%w[hello world], BagOfWords.terms(sample, false).keys.sort)
    assert_equal(["hello", "hello world", "world"], BagOfWords.terms(sample, true).keys.sort)
  end

  def test_features
    # Adjacent literals concatenate to the same string the original
    # built with +=.
    sample = "Hello world!" "Hello World Again!"

    assert_equal([2, 2], BagOfWords.features(sample, "Hello World".words.uniq.sort))
  end

  def test_stem
    assert_equal(%w[protein], "Proteins".words)
  end
end
29
+
30
+
@@ -0,0 +1,91 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt/bow/dictionary'
3
+ require 'rbbt/bow/bow'
4
+ require 'test/unit'
5
+
6
# Exercises Dictionary, Dictionary::TF_IDF and Dictionary::KL over two
# tiny document collections.
class TestDictionary < Test::Unit::TestCase

  # The two documents shared by the plain and TF-IDF tests.
  def sample_docs
    ["Hello World", "Hello Yin Yin"].collect { |text| BagOfWords.terms(text, false) }
  end

  def test_standard
    dict = Dictionary.new
    sample_docs.each { |doc| dict.add doc }

    assert_equal(2, dict.terms["hello"])
    assert_equal(2, dict.terms["yin"])
    assert_equal(0, dict.terms["bye"])
    assert_equal(1, dict.terms["world"])
  end

  def test_tf_idf
    dict = Dictionary::TF_IDF.new
    sample_docs.each { |doc| dict.add doc }

    assert_equal(2, dict.terms["hello"])
    assert_equal(2, dict.terms["yin"])
    assert_equal(0, dict.terms["bye"])
    assert_equal(1, dict.terms["world"])

    assert_equal(1, dict.df["hello"])
    assert_equal(0.5, dict.df["yin"])
    assert_equal(0, dict.df["bye"])
    assert_equal(0.5, dict.df["world"])

    assert_equal(2.0 / 5, dict.tf["hello"])
    assert_equal(2.0 / 5, dict.tf["yin"])
    assert_equal(0, dict.tf["bye"])
    assert_equal(1.0 / 5, dict.tf["world"])

    assert_equal(Math::log(1), dict.idf["hello"])
    assert_equal(Math::log(2), dict.idf["yin"])
    assert_equal(0, dict.idf["bye"])
    assert_equal(Math::log(2), dict.idf["world"])

    assert_equal(2.0 / 5 * Math::log(1), dict.tf_idf["hello"])
    assert_equal(2.0 / 5 * Math::log(2), dict.tf_idf["yin"])
    assert_equal(0, dict.tf_idf["bye"])
    assert_equal(1.0 / 5 * Math::log(2), dict.tf_idf["world"])
  end

  def test_best
    dict = Dictionary::TF_IDF.new
    sample_docs.each { |doc| dict.add doc }

    assert_equal(1, dict.best(:limit => 1).length)
    assert(dict.best(:limit => 1).include? "yin")
  end

  def test_kl
    labeled = [
      ["Hello World", :+],
      ["Hello Cruel World", :+],
      ["Hello Yan Yan", :-],
      ["Hello Yin Yin", :-],
    ]

    dict = Dictionary::KL.new
    labeled.each { |text, label| dict.add(BagOfWords.terms(text, false), label) }

    assert_equal(0, dict.kl["hello"])
    assert_equal(dict.kl['yan'], dict.kl['yin'])
    assert_in_delta(1 * Math::log(1 / 0.000001), dict.kl["world"], 0.01)
    assert_in_delta(0.5 * Math::log(0.5 / 0.000001), dict.kl["cruel"], 0.01)
  end
end
90
+
91
+
@@ -0,0 +1,9 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
2
+ require 'rbbt/bow/misc'
3
+ require 'test/unit'
4
+
5
# Checks that requiring 'rbbt/bow/misc' populated the global stopword
# list from the installed wordlists datafile.
class TestBase < Test::Unit::TestCase
  def test_url
    assert_not_nil($stopwords)
  end
end
@@ -0,0 +1,32 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt-util'
3
+ require 'rbbt/ner/regexpNER'
4
+ require 'test/unit'
5
+
6
# End-to-end test of RegExpNER against a small comma-separated lexicon
# written to a temporary file.
class TestRegExpNER < Test::Unit::TestCase

  def test_class
    text = "a bc d e f g h i j k l m n o p q one two"

    # NOTE(review): the scrape lost the heredoc's original indentation;
    # the content is written flush-left here — confirm against the gem.
    lexicon =<<-EOF
C1,a,x,xx,xxx
C2,bc,y,yy,yyy
C3,i,z,zz,zzz,m,one two
    EOF

    file = TmpFile.tmp_file
    File.open(file, 'w') { |f| f.write lexicon }

    ner = RegExpNER.new(file, :sep => ',', :stopwords => false)
    assert_equal(['a', 'bc', 'i', 'm', 'one two'].sort, ner.match_hash(text).values.flatten.sort)

    ner = RegExpNER.new(file, :sep => ',', :stopwords => true)
    assert_equal(['bc', 'm', 'one two'].sort, ner.match_hash(text).values.flatten.sort)

    FileUtils.rm file
  end
end
31
+
32
+
@@ -0,0 +1,4 @@
1
require 'test/unit'

# Make lib/ and the test directory itself requirable.
[File.join(File.dirname(__FILE__), '..', 'lib'), File.dirname(__FILE__)].each do |dir|
  $LOAD_PATH.unshift dir
end
4
+
metadata ADDED
@@ -0,0 +1,92 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbbt-text
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 4
10
+ version: 0.0.4
11
+ platform: ruby
12
+ authors:
13
+ - Miguel Vazquez
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-12-01 00:00:00 +01:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rbbt-util
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: "Text mining tools: named entity recognition and normalization, document classification, bag-of-words, dictionaries, etc"
36
+ email: miguel.vazquez@fdi.ucm.es
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - lib/rbbt/bow/bow.rb
45
+ - lib/rbbt/bow/dictionary.rb
46
+ - lib/rbbt/bow/misc.rb
47
+ - lib/rbbt/ner/regexpNER.rb
48
+ - test/rbbt/bow/test_bow.rb
49
+ - test/rbbt/bow/test_dictionary.rb
50
+ - test/rbbt/bow/test_misc.rb
51
+ - test/rbbt/ner/test_regexpNER.rb
52
+ - test/test_helper.rb
53
+ has_rdoc: true
54
+ homepage: http://github.com/mikisvaz/rbbt-util
55
+ licenses: []
56
+
57
+ post_install_message:
58
+ rdoc_options: []
59
+
60
+ require_paths:
61
+ - lib
62
+ required_ruby_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ hash: 3
68
+ segments:
69
+ - 0
70
+ version: "0"
71
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ">="
75
+ - !ruby/object:Gem::Version
76
+ hash: 3
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ requirements: []
81
+
82
+ rubyforge_project:
83
+ rubygems_version: 1.3.7
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
87
+ test_files:
88
+ - test/rbbt/bow/test_bow.rb
89
+ - test/rbbt/bow/test_dictionary.rb
90
+ - test/rbbt/bow/test_misc.rb
91
+ - test/rbbt/ner/test_regexpNER.rb
92
+ - test/test_helper.rb