classifier-reborn 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.markdown +130 -14
- data/data/stopwords/ca +126 -0
- data/data/stopwords/cs +138 -0
- data/data/stopwords/da +101 -0
- data/data/stopwords/de +604 -0
- data/data/stopwords/en +80 -0
- data/data/stopwords/es +351 -0
- data/data/stopwords/fi +747 -0
- data/data/stopwords/fr +463 -0
- data/data/stopwords/hu +35 -0
- data/data/stopwords/it +430 -0
- data/data/stopwords/nl +48 -0
- data/data/stopwords/no +119 -0
- data/data/stopwords/pl +93 -0
- data/data/stopwords/pt +356 -0
- data/data/stopwords/se +386 -0
- data/data/stopwords/tr +114 -0
- data/lib/classifier-reborn/bayes.rb +86 -16
- data/lib/classifier-reborn/category_namer.rb +3 -1
- data/lib/classifier-reborn/extensions/hasher.rb +25 -100
- data/lib/classifier-reborn/extensions/vector.rb +0 -1
- data/lib/classifier-reborn/lsi.rb +36 -25
- data/lib/classifier-reborn/lsi/cached_content_node.rb +48 -0
- data/lib/classifier-reborn/lsi/content_node.rb +27 -10
- data/lib/classifier-reborn/lsi/summarizer.rb +2 -2
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +37 -3
@@ -6,15 +6,40 @@ require_relative 'category_namer'
|
|
6
6
|
|
7
7
|
module ClassifierReborn
|
8
8
|
class Bayes
|
9
|
+
CategoryNotFoundError = Class.new(StandardError)
|
10
|
+
|
9
11
|
# The class can be created with one or more categories, each of which will be
|
10
12
|
# initialized and given a training method. E.g.,
|
11
13
|
# b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
|
12
|
-
|
14
|
+
#
|
15
|
+
# Options available are:
|
16
|
+
# language: 'en' Used to select language specific stop words
|
17
|
+
# auto_categorize: false When true, enables ability to dynamically declare a category
|
18
|
+
# enable_threshold: false When true, enables a threshold requirement for classification
|
19
|
+
# threshold: 0.0 Default threshold, only used when enabled
|
20
|
+
def initialize(*args)
|
13
21
|
@categories = Hash.new
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
22
|
+
options = { language: 'en',
|
23
|
+
auto_categorize: false,
|
24
|
+
enable_threshold: false,
|
25
|
+
threshold: 0.0
|
26
|
+
}
|
27
|
+
args.flatten.each { |arg|
|
28
|
+
if arg.kind_of?(Hash)
|
29
|
+
options.merge!(arg)
|
30
|
+
else
|
31
|
+
add_category(arg)
|
32
|
+
end
|
33
|
+
}
|
34
|
+
|
35
|
+
@total_words = 0
|
36
|
+
@category_counts = Hash.new(0)
|
37
|
+
@category_word_count = Hash.new(0)
|
38
|
+
|
39
|
+
@language = options[:language]
|
40
|
+
@auto_categorize = options[:auto_categorize]
|
41
|
+
@enable_threshold = options[:enable_threshold]
|
42
|
+
@threshold = options[:threshold]
|
18
43
|
end
|
19
44
|
|
20
45
|
# Provides a general training method for all categories specified in Bayes#new
|
@@ -25,10 +50,18 @@ module ClassifierReborn
|
|
25
50
|
# b.train "The other", "The other text"
|
26
51
|
def train(category, text)
|
27
52
|
category = CategoryNamer.prepare_name(category)
|
28
|
-
|
53
|
+
|
54
|
+
# Add the category dynamically or raise an error
|
55
|
+
if !@categories.has_key?(category)
|
56
|
+
if @auto_categorize
|
57
|
+
add_category(category)
|
58
|
+
else
|
59
|
+
raise CategoryNotFoundError.new("Cannot train; category #{category} does not exist")
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
29
63
|
@category_counts[category] += 1
|
30
|
-
Hasher.word_hash(text).each do |word, count|
|
31
|
-
@categories[category][word] ||= 0
|
64
|
+
Hasher.word_hash(text, @language).each do |word, count|
|
32
65
|
@categories[category][word] += count
|
33
66
|
@category_word_count[category] += count
|
34
67
|
@total_words += count
|
@@ -44,12 +77,10 @@ module ClassifierReborn
|
|
44
77
|
# b.untrain :this, "This text"
|
45
78
|
def untrain(category, text)
|
46
79
|
category = CategoryNamer.prepare_name(category)
|
47
|
-
@category_word_count[category] ||= 0
|
48
80
|
@category_counts[category] -= 1
|
49
|
-
Hasher.word_hash(text).each do |word, count|
|
81
|
+
Hasher.word_hash(text, @language).each do |word, count|
|
50
82
|
if @total_words >= 0
|
51
83
|
orig = @categories[category][word] || 0
|
52
|
-
@categories[category][word] ||= 0
|
53
84
|
@categories[category][word] -= count
|
54
85
|
if @categories[category][word] <= 0
|
55
86
|
@categories[category].delete(word)
|
@@ -70,7 +101,7 @@ module ClassifierReborn
|
|
70
101
|
# The largest of these scores (the one closest to 0) is the one picked out by #classify
|
71
102
|
def classifications(text)
|
72
103
|
score = Hash.new
|
73
|
-
word_hash = Hasher.word_hash(text)
|
104
|
+
word_hash = Hasher.word_hash(text, @language)
|
74
105
|
training_count = @category_counts.values.reduce(:+).to_f
|
75
106
|
@categories.each do |category, category_words|
|
76
107
|
score[category.to_s] = 0
|
@@ -87,11 +118,50 @@ module ClassifierReborn
|
|
87
118
|
end
|
88
119
|
|
89
120
|
# Returns the classification of the provided +text+, which is one of the
|
90
|
-
# categories given in the initializer. E.g.,
|
121
|
+
# categories given in the initializer along with the score. E.g.,
|
91
122
|
# b.classify "I hate bad words and you"
|
92
|
-
# => 'Uninteresting'
|
123
|
+
# => ['Uninteresting', -4.852030263919617]
|
124
|
+
def classify_with_score(text)
|
125
|
+
(classifications(text).sort_by { |a| -a[1] })[0]
|
126
|
+
end
|
127
|
+
|
128
|
+
# Return the classification without the score
|
93
129
|
def classify(text)
|
94
|
-
|
130
|
+
result, score = classify_with_score(text)
|
131
|
+
if threshold_enabled?
|
132
|
+
result = nil if score < @threshold || score == Float::INFINITY
|
133
|
+
end
|
134
|
+
return result
|
135
|
+
end
|
136
|
+
|
137
|
+
# Retrieve the current threshold value
|
138
|
+
def threshold
|
139
|
+
@threshold
|
140
|
+
end
|
141
|
+
|
142
|
+
# Dynamically set the threshold value
|
143
|
+
def threshold=(a_float)
|
144
|
+
@threshold = a_float
|
145
|
+
end
|
146
|
+
|
147
|
+
# Dynamically enable threshold for classify results
|
148
|
+
def enable_threshold
|
149
|
+
@enable_threshold = true
|
150
|
+
end
|
151
|
+
|
152
|
+
# Dynamically disable threshold for classify results
|
153
|
+
def disable_threshold
|
154
|
+
@enable_threshold = false
|
155
|
+
end
|
156
|
+
|
157
|
+
# Is threshold processing enabled?
|
158
|
+
def threshold_enabled?
|
159
|
+
@enable_threshold
|
160
|
+
end
|
161
|
+
|
162
|
+
# Is threshold processing disabled?
|
163
|
+
def threshold_disabled?
|
164
|
+
!@enable_threshold
|
95
165
|
end
|
96
166
|
|
97
167
|
# Provides training and untraining methods for the categories specified in Bayes#new
|
@@ -130,7 +200,7 @@ module ClassifierReborn
|
|
130
200
|
# more criteria than the trained selective categories. In short,
|
131
201
|
# try to initialize your categories at initialization.
|
132
202
|
def add_category(category)
|
133
|
-
@categories[CategoryNamer.prepare_name(category)]
|
203
|
+
@categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
|
134
204
|
end
|
135
205
|
|
136
206
|
alias append_category add_category
|
@@ -1,39 +1,33 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
3
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
4
|
# License:: LGPL
|
4
5
|
|
5
|
-
require
|
6
|
+
require 'set'
|
6
7
|
|
7
8
|
module ClassifierReborn
|
8
9
|
module Hasher
|
9
|
-
|
10
|
+
STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
|
10
11
|
|
11
|
-
|
12
|
-
# E.g.,
|
13
|
-
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
14
|
-
# => "Hello greetings with braces "
|
15
|
-
def without_punctuation(str)
|
16
|
-
str .tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
17
|
-
end
|
12
|
+
extend self
|
18
13
|
|
19
14
|
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
20
15
|
# interned, and indexes to its frequency in the document.
|
21
|
-
def word_hash(str)
|
22
|
-
|
23
|
-
symbol_hash = word_hash_for_symbols(str.
|
24
|
-
return
|
16
|
+
def word_hash(str, language = 'en')
|
17
|
+
cleaned_word_hash = clean_word_hash(str, language)
|
18
|
+
symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
|
19
|
+
return cleaned_word_hash.merge(symbol_hash)
|
25
20
|
end
|
26
21
|
|
27
22
|
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
28
|
-
def clean_word_hash(str)
|
29
|
-
word_hash_for_words str.gsub(/[^\
|
23
|
+
def clean_word_hash(str, language = 'en')
|
24
|
+
word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
|
30
25
|
end
|
31
26
|
|
32
|
-
def word_hash_for_words(words)
|
27
|
+
def word_hash_for_words(words, language = 'en')
|
33
28
|
d = Hash.new(0)
|
34
29
|
words.each do |word|
|
35
|
-
word.
|
36
|
-
if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
30
|
+
if word.length > 2 && !STOPWORDS[language].include?(word)
|
37
31
|
d[word.stem.intern] += 1
|
38
32
|
end
|
39
33
|
end
|
@@ -48,87 +42,18 @@ module ClassifierReborn
|
|
48
42
|
return d
|
49
43
|
end
|
50
44
|
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
came
|
65
|
-
can
|
66
|
-
cant
|
67
|
-
couldnt
|
68
|
-
did
|
69
|
-
didn
|
70
|
-
didnt
|
71
|
-
do
|
72
|
-
doesnt
|
73
|
-
dont
|
74
|
-
ever
|
75
|
-
first
|
76
|
-
from
|
77
|
-
have
|
78
|
-
her
|
79
|
-
here
|
80
|
-
him
|
81
|
-
how
|
82
|
-
i
|
83
|
-
if
|
84
|
-
in
|
85
|
-
into
|
86
|
-
is
|
87
|
-
isnt
|
88
|
-
it
|
89
|
-
itll
|
90
|
-
just
|
91
|
-
last
|
92
|
-
least
|
93
|
-
like
|
94
|
-
most
|
95
|
-
my
|
96
|
-
new
|
97
|
-
no
|
98
|
-
not
|
99
|
-
now
|
100
|
-
of
|
101
|
-
on
|
102
|
-
or
|
103
|
-
should
|
104
|
-
sinc
|
105
|
-
so
|
106
|
-
some
|
107
|
-
th
|
108
|
-
than
|
109
|
-
this
|
110
|
-
that
|
111
|
-
the
|
112
|
-
their
|
113
|
-
then
|
114
|
-
those
|
115
|
-
to
|
116
|
-
told
|
117
|
-
too
|
118
|
-
true
|
119
|
-
try
|
120
|
-
until
|
121
|
-
url
|
122
|
-
us
|
123
|
-
were
|
124
|
-
when
|
125
|
-
whether
|
126
|
-
while
|
127
|
-
with
|
128
|
-
within
|
129
|
-
yes
|
130
|
-
you
|
131
|
-
youll
|
132
|
-
])
|
45
|
+
# Create a lazily-loaded hash of stopword data
|
46
|
+
STOPWORDS = Hash.new do |hash, language|
|
47
|
+
hash[language] = []
|
48
|
+
|
49
|
+
STOPWORDS_PATH.each do |path|
|
50
|
+
if File.exist?(File.join(path, language))
|
51
|
+
hash[language] = Set.new File.read(File.join(path, language.to_s)).split
|
52
|
+
break
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
hash[language]
|
57
|
+
end
|
133
58
|
end
|
134
59
|
end
|
@@ -15,6 +15,7 @@ end
|
|
15
15
|
|
16
16
|
require_relative 'lsi/word_list'
|
17
17
|
require_relative 'lsi/content_node'
|
18
|
+
require_relative 'lsi/cached_content_node'
|
18
19
|
require_relative 'lsi/summarizer'
|
19
20
|
|
20
21
|
module ClassifierReborn
|
@@ -24,24 +25,30 @@ module ClassifierReborn
|
|
24
25
|
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
|
25
26
|
class LSI
|
26
27
|
|
27
|
-
attr_reader :word_list
|
28
|
+
attr_reader :word_list, :cache_node_vectors
|
28
29
|
attr_accessor :auto_rebuild
|
29
30
|
|
30
31
|
# Create a fresh index.
|
31
32
|
# If you want to call #build_index manually, use
|
32
33
|
# ClassifierReborn::LSI.new :auto_rebuild => false
|
34
|
+
# If you want to use ContentNodes with cached vector transpositions, use
|
35
|
+
# lsi = ClassifierReborn::LSI.new :cache_node_vectors => true
|
33
36
|
#
|
34
37
|
def initialize(options = {})
|
35
|
-
@auto_rebuild =
|
38
|
+
@auto_rebuild = options[:auto_rebuild] != false
|
36
39
|
@word_list, @items = WordList.new, {}
|
37
40
|
@version, @built_at_version = 0, -1
|
41
|
+
@language = options[:language] || 'en'
|
42
|
+
if @cache_node_vectors = options[:cache_node_vectors]
|
43
|
+
extend CachedContentNode::InstanceMethods
|
44
|
+
end
|
38
45
|
end
|
39
46
|
|
40
47
|
# Returns true if the index needs to be rebuilt. The index needs
|
41
48
|
# to be built after all informaton is added, but before you start
|
42
49
|
# using it for search, classification and cluster detection.
|
43
50
|
def needs_rebuild?
|
44
|
-
(@items.
|
51
|
+
(@items.size > 1) && (@version != @built_at_version)
|
45
52
|
end
|
46
53
|
|
47
54
|
# Adds an item to the index. item is assumed to be a string, but
|
@@ -58,8 +65,12 @@ module ClassifierReborn
|
|
58
65
|
# lsi.add_item ar, *ar.categories { |x| ar.content }
|
59
66
|
#
|
60
67
|
def add_item( item, *categories, &block )
|
61
|
-
clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
|
62
|
-
@items[item] =
|
68
|
+
clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
|
69
|
+
@items[item] = if @cache_node_vectors
|
70
|
+
CachedContentNode.new(clean_word_hash, *categories)
|
71
|
+
else
|
72
|
+
ContentNode.new(clean_word_hash, *categories)
|
73
|
+
end
|
63
74
|
@version += 1
|
64
75
|
build_index if @auto_rebuild
|
65
76
|
end
|
@@ -93,13 +104,6 @@ module ClassifierReborn
|
|
93
104
|
@items.keys
|
94
105
|
end
|
95
106
|
|
96
|
-
# Returns the categories for a given indexed items. You are free to add and remove
|
97
|
-
# items from this as you see fit. It does not invalide an index to change its categories.
|
98
|
-
def categories_for(item)
|
99
|
-
return [] unless @items[item]
|
100
|
-
return @items[item].categories
|
101
|
-
end
|
102
|
-
|
103
107
|
# This function rebuilds the index if needs_rebuild? returns true.
|
104
108
|
# For very large document spaces, this indexing operation may take some
|
105
109
|
# time to complete, so it may be wise to place the operation in another
|
@@ -155,7 +159,7 @@ module ClassifierReborn
|
|
155
159
|
return [] if needs_rebuild?
|
156
160
|
|
157
161
|
avg_density = Hash.new
|
158
|
-
@items.each_key { |
|
162
|
+
@items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |x,y| x + y[1]} }
|
159
163
|
|
160
164
|
avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
|
161
165
|
end
|
@@ -179,9 +183,9 @@ module ClassifierReborn
|
|
179
183
|
result =
|
180
184
|
@items.keys.collect do |item|
|
181
185
|
if $GSL
|
182
|
-
|
186
|
+
val = content_node.search_vector * @items[item].transposed_search_vector
|
183
187
|
else
|
184
|
-
|
188
|
+
val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
|
185
189
|
end
|
186
190
|
[item, val]
|
187
191
|
end
|
@@ -234,35 +238,42 @@ module ClassifierReborn
|
|
234
238
|
# articles, or find paragraphs that relate to each other in an essay.
|
235
239
|
def find_related( doc, max_nearest=3, &block )
|
236
240
|
carry =
|
237
|
-
proximity_array_for_content( doc, &block ).reject { |pair| pair[0]
|
241
|
+
proximity_array_for_content( doc, &block ).reject { |pair| pair[0].eql? doc }
|
238
242
|
result = carry.collect { |x| x[0] }
|
239
243
|
return result[0..max_nearest-1]
|
240
244
|
end
|
241
245
|
|
246
|
+
# Return the most obvious category with the score
|
247
|
+
def classify_with_score( doc, cutoff=0.30, &block)
|
248
|
+
return scored_categories(doc, cutoff, &block).last
|
249
|
+
end
|
250
|
+
|
251
|
+
# Return the most obvious category without the score
|
252
|
+
def classify( doc, cutoff=0.30, &block )
|
253
|
+
return scored_categories(doc, cutoff, &block).last.first
|
254
|
+
end
|
255
|
+
|
242
256
|
# This function uses a voting system to categorize documents, based on
|
243
257
|
# the categories of other documents. It uses the same logic as the
|
244
258
|
# find_related function to find related documents, then returns the
|
245
|
-
#
|
259
|
+
# list of sorted categories.
|
246
260
|
#
|
247
261
|
# cutoff signifies the number of documents to consider when clasifying
|
248
262
|
# text. A cutoff of 1 means that every document in the index votes on
|
249
263
|
# what category the document is in. This may not always make sense.
|
250
264
|
#
|
251
|
-
def
|
265
|
+
def scored_categories( doc, cutoff=0.30, &block )
|
252
266
|
icutoff = (@items.size * cutoff).round
|
253
267
|
carry = proximity_array_for_content( doc, &block )
|
254
268
|
carry = carry[0..icutoff-1]
|
255
|
-
votes =
|
269
|
+
votes = Hash.new(0.0)
|
256
270
|
carry.each do |pair|
|
257
|
-
|
258
|
-
categories.each do |category|
|
259
|
-
votes[category] ||= 0.0
|
271
|
+
@items[pair[0]].categories.each do |category|
|
260
272
|
votes[category] += pair[1]
|
261
273
|
end
|
262
274
|
end
|
263
275
|
|
264
|
-
|
265
|
-
return ranking[-1]
|
276
|
+
return votes.sort_by { |_, score| score }
|
266
277
|
end
|
267
278
|
|
268
279
|
# Prototype, only works on indexed documents.
|
@@ -293,7 +304,7 @@ module ClassifierReborn
|
|
293
304
|
if @items[item]
|
294
305
|
return @items[item]
|
295
306
|
else
|
296
|
-
clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
|
307
|
+
clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
|
297
308
|
|
298
309
|
cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
|
299
310
|
|