classifier-reborn 2.0.3 → 2.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.markdown +130 -14
- data/data/stopwords/ca +126 -0
- data/data/stopwords/cs +138 -0
- data/data/stopwords/da +101 -0
- data/data/stopwords/de +604 -0
- data/data/stopwords/en +80 -0
- data/data/stopwords/es +351 -0
- data/data/stopwords/fi +747 -0
- data/data/stopwords/fr +463 -0
- data/data/stopwords/hu +35 -0
- data/data/stopwords/it +430 -0
- data/data/stopwords/nl +48 -0
- data/data/stopwords/no +119 -0
- data/data/stopwords/pl +93 -0
- data/data/stopwords/pt +356 -0
- data/data/stopwords/se +386 -0
- data/data/stopwords/tr +114 -0
- data/lib/classifier-reborn/bayes.rb +86 -16
- data/lib/classifier-reborn/category_namer.rb +3 -1
- data/lib/classifier-reborn/extensions/hasher.rb +25 -100
- data/lib/classifier-reborn/extensions/vector.rb +0 -1
- data/lib/classifier-reborn/lsi.rb +36 -25
- data/lib/classifier-reborn/lsi/cached_content_node.rb +48 -0
- data/lib/classifier-reborn/lsi/content_node.rb +27 -10
- data/lib/classifier-reborn/lsi/summarizer.rb +2 -2
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +37 -3
@@ -6,15 +6,40 @@ require_relative 'category_namer'
|
|
6
6
|
|
7
7
|
module ClassifierReborn
|
8
8
|
class Bayes
|
9
|
+
CategoryNotFoundError = Class.new(StandardError)
|
10
|
+
|
9
11
|
# The class can be created with one or more categories, each of which will be
|
10
12
|
# initialized and given a training method. E.g.,
|
11
13
|
# b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
12
|
-
|
14
|
+
#
|
15
|
+
# Options available are:
|
16
|
+
# language: 'en' Used to select language specific stop words
|
17
|
+
# auto_categorize: false When true, enables ability to dynamically declare a category
|
18
|
+
# enable_threshold: false When true, enables a threshold requirement for classification
|
19
|
+
# threshold: 0.0 Default threshold, only used when enabled
|
20
|
+
def initialize(*args)
|
13
21
|
@categories = Hash.new
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
22
|
+
options = { language: 'en',
|
23
|
+
auto_categorize: false,
|
24
|
+
enable_threshold: false,
|
25
|
+
threshold: 0.0
|
26
|
+
}
|
27
|
+
args.flatten.each { |arg|
|
28
|
+
if arg.kind_of?(Hash)
|
29
|
+
options.merge!(arg)
|
30
|
+
else
|
31
|
+
add_category(arg)
|
32
|
+
end
|
33
|
+
}
|
34
|
+
|
35
|
+
@total_words = 0
|
36
|
+
@category_counts = Hash.new(0)
|
37
|
+
@category_word_count = Hash.new(0)
|
38
|
+
|
39
|
+
@language = options[:language]
|
40
|
+
@auto_categorize = options[:auto_categorize]
|
41
|
+
@enable_threshold = options[:enable_threshold]
|
42
|
+
@threshold = options[:threshold]
|
18
43
|
end
|
19
44
|
|
20
45
|
# Provides a general training method for all categories specified in Bayes#new
|
@@ -25,10 +50,18 @@ module ClassifierReborn
|
|
25
50
|
# b.train "The other", "The other text"
|
26
51
|
def train(category, text)
|
27
52
|
category = CategoryNamer.prepare_name(category)
|
28
|
-
|
53
|
+
|
54
|
+
# Add the category dynamically or raise an error
|
55
|
+
if !@categories.has_key?(category)
|
56
|
+
if @auto_categorize
|
57
|
+
add_category(category)
|
58
|
+
else
|
59
|
+
raise CategoryNotFoundError.new("Cannot train; category #{category} does not exist")
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
29
63
|
@category_counts[category] += 1
|
30
|
-
Hasher.word_hash(text).each do |word, count|
|
31
|
-
@categories[category][word] ||= 0
|
64
|
+
Hasher.word_hash(text, @language).each do |word, count|
|
32
65
|
@categories[category][word] += count
|
33
66
|
@category_word_count[category] += count
|
34
67
|
@total_words += count
|
@@ -44,12 +77,10 @@ module ClassifierReborn
|
|
44
77
|
# b.untrain :this, "This text"
|
45
78
|
def untrain(category, text)
|
46
79
|
category = CategoryNamer.prepare_name(category)
|
47
|
-
@category_word_count[category] ||= 0
|
48
80
|
@category_counts[category] -= 1
|
49
|
-
Hasher.word_hash(text).each do |word, count|
|
81
|
+
Hasher.word_hash(text, @language).each do |word, count|
|
50
82
|
if @total_words >= 0
|
51
83
|
orig = @categories[category][word] || 0
|
52
|
-
@categories[category][word] ||= 0
|
53
84
|
@categories[category][word] -= count
|
54
85
|
if @categories[category][word] <= 0
|
55
86
|
@categories[category].delete(word)
|
@@ -70,7 +101,7 @@ module ClassifierReborn
|
|
70
101
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
71
102
|
def classifications(text)
|
72
103
|
score = Hash.new
|
73
|
-
word_hash = Hasher.word_hash(text)
|
104
|
+
word_hash = Hasher.word_hash(text, @language)
|
74
105
|
training_count = @category_counts.values.reduce(:+).to_f
|
75
106
|
@categories.each do |category, category_words|
|
76
107
|
score[category.to_s] = 0
|
@@ -87,11 +118,50 @@ module ClassifierReborn
|
|
87
118
|
end
|
88
119
|
|
89
120
|
# Returns the classification of the provided +text+, which is one of the
|
90
|
-
# categories given in the initializer. E.g.,
|
121
|
+
# categories given in the initializer along with the score. E.g.,
|
91
122
|
# b.classify "I hate bad words and you"
|
92
|
-
# => 'Uninteresting'
|
123
|
+
# => ['Uninteresting', -4.852030263919617]
|
124
|
+
def classify_with_score(text)
|
125
|
+
(classifications(text).sort_by { |a| -a[1] })[0]
|
126
|
+
end
|
127
|
+
|
128
|
+
# Return the classification without the score
|
93
129
|
def classify(text)
|
94
|
-
|
130
|
+
result, score = classify_with_score(text)
|
131
|
+
if threshold_enabled?
|
132
|
+
result = nil if score < @threshold || score == Float::INFINITY
|
133
|
+
end
|
134
|
+
return result
|
135
|
+
end
|
136
|
+
|
137
|
+
# Retrieve the current threshold value
|
138
|
+
def threshold
|
139
|
+
@threshold
|
140
|
+
end
|
141
|
+
|
142
|
+
# Dynamically set the threshold value
|
143
|
+
def threshold=(a_float)
|
144
|
+
@threshold = a_float
|
145
|
+
end
|
146
|
+
|
147
|
+
# Dynamically enable threshold for classify results
|
148
|
+
def enable_threshold
|
149
|
+
@enable_threshold = true
|
150
|
+
end
|
151
|
+
|
152
|
+
# Dynamically disable threshold for classify results
|
153
|
+
def disable_threshold
|
154
|
+
@enable_threshold = false
|
155
|
+
end
|
156
|
+
|
157
|
+
# Is threshold processing enabled?
|
158
|
+
def threshold_enabled?
|
159
|
+
@enable_threshold
|
160
|
+
end
|
161
|
+
|
162
|
+
# Is threshold processing disabled?
|
163
|
+
def threshold_disabled?
|
164
|
+
!@enable_threshold
|
95
165
|
end
|
96
166
|
|
97
167
|
# Provides training and untraining methods for the categories specified in Bayes#new
|
@@ -130,7 +200,7 @@ module ClassifierReborn
|
|
130
200
|
# more criteria than the trained selective categories. In short,
|
131
201
|
# try to initialize your categories at initialization.
|
132
202
|
def add_category(category)
|
133
|
-
@categories[CategoryNamer.prepare_name(category)]
|
203
|
+
@categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
|
134
204
|
end
|
135
205
|
|
136
206
|
alias append_category add_category
|
@@ -1,39 +1,33 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
3
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
4
|
# License:: LGPL
|
4
5
|
|
5
|
-
require
|
6
|
+
require 'set'
|
6
7
|
|
7
8
|
module ClassifierReborn
|
8
9
|
module Hasher
|
9
|
-
|
10
|
+
STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
|
10
11
|
|
11
|
-
|
12
|
-
# E.g.,
|
13
|
-
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
14
|
-
# => "Hello greetings with braces "
|
15
|
-
def without_punctuation(str)
|
16
|
-
str .tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
17
|
-
end
|
12
|
+
extend self
|
18
13
|
|
19
14
|
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
20
15
|
# interned, and indexes to its frequency in the document.
|
21
|
-
def word_hash(str)
|
22
|
-
|
23
|
-
symbol_hash = word_hash_for_symbols(str.
|
24
|
-
return
|
16
|
+
def word_hash(str, language = 'en')
|
17
|
+
cleaned_word_hash = clean_word_hash(str, language)
|
18
|
+
symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
|
19
|
+
return cleaned_word_hash.merge(symbol_hash)
|
25
20
|
end
|
26
21
|
|
27
22
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
28
|
-
def clean_word_hash(str)
|
29
|
-
word_hash_for_words str.gsub(/[^\
|
23
|
+
def clean_word_hash(str, language = 'en')
|
24
|
+
word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
|
30
25
|
end
|
31
26
|
|
32
|
-
def word_hash_for_words(words)
|
27
|
+
def word_hash_for_words(words, language = 'en')
|
33
28
|
d = Hash.new(0)
|
34
29
|
words.each do |word|
|
35
|
-
word.
|
36
|
-
if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
30
|
+
if word.length > 2 && !STOPWORDS[language].include?(word)
|
37
31
|
d[word.stem.intern] += 1
|
38
32
|
end
|
39
33
|
end
|
@@ -48,87 +42,18 @@ module ClassifierReborn
|
|
48
42
|
return d
|
49
43
|
end
|
50
44
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
came
|
65
|
-
can
|
66
|
-
cant
|
67
|
-
couldnt
|
68
|
-
did
|
69
|
-
didn
|
70
|
-
didnt
|
71
|
-
do
|
72
|
-
doesnt
|
73
|
-
dont
|
74
|
-
ever
|
75
|
-
first
|
76
|
-
from
|
77
|
-
have
|
78
|
-
her
|
79
|
-
here
|
80
|
-
him
|
81
|
-
how
|
82
|
-
i
|
83
|
-
if
|
84
|
-
in
|
85
|
-
into
|
86
|
-
is
|
87
|
-
isnt
|
88
|
-
it
|
89
|
-
itll
|
90
|
-
just
|
91
|
-
last
|
92
|
-
least
|
93
|
-
like
|
94
|
-
most
|
95
|
-
my
|
96
|
-
new
|
97
|
-
no
|
98
|
-
not
|
99
|
-
now
|
100
|
-
of
|
101
|
-
on
|
102
|
-
or
|
103
|
-
should
|
104
|
-
sinc
|
105
|
-
so
|
106
|
-
some
|
107
|
-
th
|
108
|
-
than
|
109
|
-
this
|
110
|
-
that
|
111
|
-
the
|
112
|
-
their
|
113
|
-
then
|
114
|
-
those
|
115
|
-
to
|
116
|
-
told
|
117
|
-
too
|
118
|
-
true
|
119
|
-
try
|
120
|
-
until
|
121
|
-
url
|
122
|
-
us
|
123
|
-
were
|
124
|
-
when
|
125
|
-
whether
|
126
|
-
while
|
127
|
-
with
|
128
|
-
within
|
129
|
-
yes
|
130
|
-
you
|
131
|
-
youll
|
132
|
-
])
|
45
|
+
# Create a lazily-loaded hash of stopword data
|
46
|
+
STOPWORDS = Hash.new do |hash, language|
|
47
|
+
hash[language] = []
|
48
|
+
|
49
|
+
STOPWORDS_PATH.each do |path|
|
50
|
+
if File.exist?(File.join(path, language))
|
51
|
+
hash[language] = Set.new File.read(File.join(path, language.to_s)).split
|
52
|
+
break
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
hash[language]
|
57
|
+
end
|
133
58
|
end
|
134
59
|
end
|
@@ -15,6 +15,7 @@ end
|
|
15
15
|
|
16
16
|
require_relative 'lsi/word_list'
|
17
17
|
require_relative 'lsi/content_node'
|
18
|
+
require_relative 'lsi/cached_content_node'
|
18
19
|
require_relative 'lsi/summarizer'
|
19
20
|
|
20
21
|
module ClassifierReborn
|
@@ -24,24 +25,30 @@ module ClassifierReborn
|
|
24
25
|
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
|
25
26
|
class LSI
|
26
27
|
|
27
|
-
attr_reader :word_list
|
28
|
+
attr_reader :word_list, :cache_node_vectors
|
28
29
|
attr_accessor :auto_rebuild
|
29
30
|
|
30
31
|
# Create a fresh index.
|
31
32
|
# If you want to call #build_index manually, use
|
32
33
|
# ClassifierReborn::LSI.new :auto_rebuild => false
|
34
|
+
# If you want to use ContentNodes with cached vector transpositions, use
|
35
|
+
# lsi = ClassifierReborn::LSI.new :cache_node_vectors => true
|
33
36
|
#
|
34
37
|
def initialize(options = {})
|
35
|
-
@auto_rebuild =
|
38
|
+
@auto_rebuild = options[:auto_rebuild] != false
|
36
39
|
@word_list, @items = WordList.new, {}
|
37
40
|
@version, @built_at_version = 0, -1
|
41
|
+
@language = options[:language] || 'en'
|
42
|
+
if @cache_node_vectors = options[:cache_node_vectors]
|
43
|
+
extend CachedContentNode::InstanceMethods
|
44
|
+
end
|
38
45
|
end
|
39
46
|
|
40
47
|
# Returns true if the index needs to be rebuilt. The index needs
|
41
48
|
# to be built after all information is added, but before you start
|
42
49
|
# using it for search, classification and cluster detection.
|
43
50
|
def needs_rebuild?
|
44
|
-
(@items.
|
51
|
+
(@items.size > 1) && (@version != @built_at_version)
|
45
52
|
end
|
46
53
|
|
47
54
|
# Adds an item to the index. item is assumed to be a string, but
|
@@ -58,8 +65,12 @@ module ClassifierReborn
|
|
58
65
|
# lsi.add_item ar, *ar.categories { |x| ar.content }
|
59
66
|
#
|
60
67
|
def add_item( item, *categories, &block )
|
61
|
-
clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
|
62
|
-
@items[item] =
|
68
|
+
clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
|
69
|
+
@items[item] = if @cache_node_vectors
|
70
|
+
CachedContentNode.new(clean_word_hash, *categories)
|
71
|
+
else
|
72
|
+
ContentNode.new(clean_word_hash, *categories)
|
73
|
+
end
|
63
74
|
@version += 1
|
64
75
|
build_index if @auto_rebuild
|
65
76
|
end
|
@@ -93,13 +104,6 @@ module ClassifierReborn
|
|
93
104
|
@items.keys
|
94
105
|
end
|
95
106
|
|
96
|
-
# Returns the categories for a given indexed items. You are free to add and remove
|
97
|
-
# items from this as you see fit. It does not invalide an index to change its categories.
|
98
|
-
def categories_for(item)
|
99
|
-
return [] unless @items[item]
|
100
|
-
return @items[item].categories
|
101
|
-
end
|
102
|
-
|
103
107
|
# This function rebuilds the index if needs_rebuild? returns true.
|
104
108
|
# For very large document spaces, this indexing operation may take some
|
105
109
|
# time to complete, so it may be wise to place the operation in another
|
@@ -155,7 +159,7 @@ module ClassifierReborn
|
|
155
159
|
return [] if needs_rebuild?
|
156
160
|
|
157
161
|
avg_density = Hash.new
|
158
|
-
@items.each_key { |
|
162
|
+
@items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |x,y| x + y[1]} }
|
159
163
|
|
160
164
|
avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
|
161
165
|
end
|
@@ -179,9 +183,9 @@ module ClassifierReborn
|
|
179
183
|
result =
|
180
184
|
@items.keys.collect do |item|
|
181
185
|
if $GSL
|
182
|
-
|
186
|
+
val = content_node.search_vector * @items[item].transposed_search_vector
|
183
187
|
else
|
184
|
-
|
188
|
+
val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
185
189
|
end
|
186
190
|
[item, val]
|
187
191
|
end
|
@@ -234,35 +238,42 @@ module ClassifierReborn
|
|
234
238
|
# articles, or find paragraphs that relate to each other in an essay.
|
235
239
|
def find_related( doc, max_nearest=3, &block )
|
236
240
|
carry =
|
237
|
-
proximity_array_for_content( doc, &block ).reject { |pair| pair[0]
|
241
|
+
proximity_array_for_content( doc, &block ).reject { |pair| pair[0].eql? doc }
|
238
242
|
result = carry.collect { |x| x[0] }
|
239
243
|
return result[0..max_nearest-1]
|
240
244
|
end
|
241
245
|
|
246
|
+
# Return the most obvious category with the score
|
247
|
+
def classify_with_score( doc, cutoff=0.30, &block)
|
248
|
+
return scored_categories(doc, cutoff, &block).last
|
249
|
+
end
|
250
|
+
|
251
|
+
# Return the most obvious category without the score
|
252
|
+
def classify( doc, cutoff=0.30, &block )
|
253
|
+
return scored_categories(doc, cutoff, &block).last.first
|
254
|
+
end
|
255
|
+
|
242
256
|
# This function uses a voting system to categorize documents, based on
|
243
257
|
# the categories of other documents. It uses the same logic as the
|
244
258
|
# find_related function to find related documents, then returns the
|
245
|
-
#
|
259
|
+
# list of sorted categories.
|
246
260
|
#
|
247
261
|
# cutoff signifies the fraction of indexed documents to consider when classifying
|
248
262
|
# text. A cutoff of 1 means that every document in the index votes on
|
249
263
|
# what category the document is in. This may not always make sense.
|
250
264
|
#
|
251
|
-
def
|
265
|
+
def scored_categories( doc, cutoff=0.30, &block )
|
252
266
|
icutoff = (@items.size * cutoff).round
|
253
267
|
carry = proximity_array_for_content( doc, &block )
|
254
268
|
carry = carry[0..icutoff-1]
|
255
|
-
votes =
|
269
|
+
votes = Hash.new(0.0)
|
256
270
|
carry.each do |pair|
|
257
|
-
|
258
|
-
categories.each do |category|
|
259
|
-
votes[category] ||= 0.0
|
271
|
+
@items[pair[0]].categories.each do |category|
|
260
272
|
votes[category] += pair[1]
|
261
273
|
end
|
262
274
|
end
|
263
275
|
|
264
|
-
|
265
|
-
return ranking[-1]
|
276
|
+
return votes.sort_by { |_, score| score }
|
266
277
|
end
|
267
278
|
|
268
279
|
# Prototype, only works on indexed documents.
|
@@ -293,7 +304,7 @@ module ClassifierReborn
|
|
293
304
|
if @items[item]
|
294
305
|
return @items[item]
|
295
306
|
else
|
296
|
-
clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
|
307
|
+
clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
|
297
308
|
|
298
309
|
cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
|
299
310
|
|