classifier-reborn 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,15 +6,40 @@ require_relative 'category_namer'
6
6
 
7
7
  module ClassifierReborn
8
8
  class Bayes
9
+ CategoryNotFoundError = Class.new(StandardError)
10
+
9
11
  # The class can be created with one or more categories, each of which will be
10
12
  # initialized and given a training method. E.g.,
11
13
  # b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
12
- def initialize(*categories)
14
+ #
15
+ # Options available are:
16
+ # language: 'en' Used to select language specific stop words
17
+ # auto_categorize: false When true, enables ability to dynamically declare a category
18
+ # enable_threshold: false When true, enables a threshold requirement for classification
19
+ # threshold: 0.0 Default threshold, only used when enabled
20
+ def initialize(*args)
13
21
  @categories = Hash.new
14
- categories.each { |category| @categories[CategoryNamer.prepare_name(category)] = Hash.new }
15
- @total_words = 0
16
- @category_counts = Hash.new(0)
17
- @category_word_count = Hash.new
22
+ options = { language: 'en',
23
+ auto_categorize: false,
24
+ enable_threshold: false,
25
+ threshold: 0.0
26
+ }
27
+ args.flatten.each { |arg|
28
+ if arg.kind_of?(Hash)
29
+ options.merge!(arg)
30
+ else
31
+ add_category(arg)
32
+ end
33
+ }
34
+
35
+ @total_words = 0
36
+ @category_counts = Hash.new(0)
37
+ @category_word_count = Hash.new(0)
38
+
39
+ @language = options[:language]
40
+ @auto_categorize = options[:auto_categorize]
41
+ @enable_threshold = options[:enable_threshold]
42
+ @threshold = options[:threshold]
18
43
  end
19
44
 
20
45
  # Provides a general training method for all categories specified in Bayes#new
@@ -25,10 +50,18 @@ module ClassifierReborn
25
50
  # b.train "The other", "The other text"
26
51
  def train(category, text)
27
52
  category = CategoryNamer.prepare_name(category)
28
- @category_word_count[category] ||= 0
53
+
54
+ # Add the category dynamically or raise an error
55
+ if !@categories.has_key?(category)
56
+ if @auto_categorize
57
+ add_category(category)
58
+ else
59
+ raise CategoryNotFoundError.new("Cannot train; category #{category} does not exist")
60
+ end
61
+ end
62
+
29
63
  @category_counts[category] += 1
30
- Hasher.word_hash(text).each do |word, count|
31
- @categories[category][word] ||= 0
64
+ Hasher.word_hash(text, @language).each do |word, count|
32
65
  @categories[category][word] += count
33
66
  @category_word_count[category] += count
34
67
  @total_words += count
@@ -44,12 +77,10 @@ module ClassifierReborn
44
77
  # b.untrain :this, "This text"
45
78
  def untrain(category, text)
46
79
  category = CategoryNamer.prepare_name(category)
47
- @category_word_count[category] ||= 0
48
80
  @category_counts[category] -= 1
49
- Hasher.word_hash(text).each do |word, count|
81
+ Hasher.word_hash(text, @language).each do |word, count|
50
82
  if @total_words >= 0
51
83
  orig = @categories[category][word] || 0
52
- @categories[category][word] ||= 0
53
84
  @categories[category][word] -= count
54
85
  if @categories[category][word] <= 0
55
86
  @categories[category].delete(word)
@@ -70,7 +101,7 @@ module ClassifierReborn
70
101
  # The largest of these scores (the one closest to 0) is the one picked out by #classify
71
102
  def classifications(text)
72
103
  score = Hash.new
73
- word_hash = Hasher.word_hash(text)
104
+ word_hash = Hasher.word_hash(text, @language)
74
105
  training_count = @category_counts.values.reduce(:+).to_f
75
106
  @categories.each do |category, category_words|
76
107
  score[category.to_s] = 0
@@ -87,11 +118,50 @@ module ClassifierReborn
87
118
  end
88
119
 
89
120
  # Returns the classification of the provided +text+, which is one of the
90
- # categories given in the initializer. E.g.,
121
+ # categories given in the initializer along with the score. E.g.,
91
122
  # b.classify "I hate bad words and you"
92
- # => 'Uninteresting'
123
+ # => ['Uninteresting', -4.852030263919617]
124
+ def classify_with_score(text)
125
+ (classifications(text).sort_by { |a| -a[1] })[0]
126
+ end
127
+
128
+ # Return the classification without the score
93
129
  def classify(text)
94
- (classifications(text).sort_by { |a| -a[1] })[0][0]
130
+ result, score = classify_with_score(text)
131
+ if threshold_enabled?
132
+ result = nil if score < @threshold || score == Float::INFINITY
133
+ end
134
+ return result
135
+ end
136
+
137
+ # Retrieve the current threshold value
138
+ def threshold
139
+ @threshold
140
+ end
141
+
142
+ # Dynamically set the threshold value
143
+ def threshold=(a_float)
144
+ @threshold = a_float
145
+ end
146
+
147
+ # Dynamically enable threshold for classify results
148
+ def enable_threshold
149
+ @enable_threshold = true
150
+ end
151
+
152
+ # Dynamically disable threshold for classify results
153
+ def disable_threshold
154
+ @enable_threshold = false
155
+ end
156
+
157
+ # Is threshold processing enabled?
158
+ def threshold_enabled?
159
+ @enable_threshold
160
+ end
161
+
162
+ # is threshold processing disabled?
163
+ def threshold_disabled?
164
+ !@enable_threshold
95
165
  end
96
166
 
97
167
  # Provides training and untraining methods for the categories specified in Bayes#new
@@ -130,7 +200,7 @@ module ClassifierReborn
130
200
  # more criteria than the trained selective categories. In short,
131
201
  # try to initialize your categories at initialization.
132
202
  def add_category(category)
133
- @categories[CategoryNamer.prepare_name(category)] = Hash.new
203
+ @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
134
204
  end
135
205
 
136
206
  alias append_category add_category
@@ -9,7 +9,9 @@ module ClassifierReborn
9
9
  module CategoryNamer
10
10
  extend self
11
11
  def prepare_name(name)
12
- name.to_s.gsub("_"," ").capitalize.intern
12
+ return name if name.is_a?(Symbol)
13
+
14
+ name.to_s.gsub("_"," ").capitalize.intern
13
15
  end
14
16
  end
15
17
  end
@@ -1,39 +1,33 @@
1
+ # encoding: utf-8
1
2
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
3
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
4
  # License:: LGPL
4
5
 
5
- require "set"
6
+ require 'set'
6
7
 
7
8
  module ClassifierReborn
8
9
  module Hasher
9
- extend self
10
+ STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
10
11
 
11
- # Removes common punctuation symbols, returning a new string.
12
- # E.g.,
13
- # "Hello (greeting's), with {braces} < >...?".without_punctuation
14
- # => "Hello greetings with braces "
15
- def without_punctuation(str)
16
- str .tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
17
- end
12
+ extend self
18
13
 
19
14
  # Return a Hash of strings => ints. Each word in the string is stemmed,
20
15
  # interned, and indexes to its frequency in the document.
21
- def word_hash(str)
22
- word_hash = clean_word_hash(str)
23
- symbol_hash = word_hash_for_symbols(str.gsub(/[\w]/," ").split)
24
- return clean_word_hash(str).merge(symbol_hash)
16
+ def word_hash(str, language = 'en')
17
+ cleaned_word_hash = clean_word_hash(str, language)
18
+ symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
19
+ return cleaned_word_hash.merge(symbol_hash)
25
20
  end
26
21
 
27
22
  # Return a word hash without extra punctuation or short symbols, just stemmed words
28
- def clean_word_hash(str)
29
- word_hash_for_words str.gsub(/[^\w\s]/,"").split
23
+ def clean_word_hash(str, language = 'en')
24
+ word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
30
25
  end
31
26
 
32
- def word_hash_for_words(words)
27
+ def word_hash_for_words(words, language = 'en')
33
28
  d = Hash.new(0)
34
29
  words.each do |word|
35
- word.downcase!
36
- if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
30
+ if word.length > 2 && !STOPWORDS[language].include?(word)
37
31
  d[word.stem.intern] += 1
38
32
  end
39
33
  end
@@ -48,87 +42,18 @@ module ClassifierReborn
48
42
  return d
49
43
  end
50
44
 
51
- CORPUS_SKIP_WORDS = Set.new(%w[
52
- a
53
- again
54
- all
55
- along
56
- are
57
- also
58
- an
59
- and
60
- as
61
- at
62
- but
63
- by
64
- came
65
- can
66
- cant
67
- couldnt
68
- did
69
- didn
70
- didnt
71
- do
72
- doesnt
73
- dont
74
- ever
75
- first
76
- from
77
- have
78
- her
79
- here
80
- him
81
- how
82
- i
83
- if
84
- in
85
- into
86
- is
87
- isnt
88
- it
89
- itll
90
- just
91
- last
92
- least
93
- like
94
- most
95
- my
96
- new
97
- no
98
- not
99
- now
100
- of
101
- on
102
- or
103
- should
104
- sinc
105
- so
106
- some
107
- th
108
- than
109
- this
110
- that
111
- the
112
- their
113
- then
114
- those
115
- to
116
- told
117
- too
118
- true
119
- try
120
- until
121
- url
122
- us
123
- were
124
- when
125
- whether
126
- while
127
- with
128
- within
129
- yes
130
- you
131
- youll
132
- ])
45
+ # Create a lazily-loaded hash of stopword data
46
+ STOPWORDS = Hash.new do |hash, language|
47
+ hash[language] = []
48
+
49
+ STOPWORDS_PATH.each do |path|
50
+ if File.exist?(File.join(path, language))
51
+ hash[language] = Set.new File.read(File.join(path, language.to_s)).split
52
+ break
53
+ end
54
+ end
55
+
56
+ hash[language]
57
+ end
133
58
  end
134
59
  end
@@ -21,7 +21,6 @@ class Matrix
21
21
 
22
22
  qrot = q.dup
23
23
  v = Matrix.identity(q.row_size)
24
- azrot = nil
25
24
  mzrot = nil
26
25
  cnt = 0
27
26
  s_old = nil
@@ -15,6 +15,7 @@ end
15
15
 
16
16
  require_relative 'lsi/word_list'
17
17
  require_relative 'lsi/content_node'
18
+ require_relative 'lsi/cached_content_node'
18
19
  require_relative 'lsi/summarizer'
19
20
 
20
21
  module ClassifierReborn
@@ -24,24 +25,30 @@ module ClassifierReborn
24
25
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
25
26
  class LSI
26
27
 
27
- attr_reader :word_list
28
+ attr_reader :word_list, :cache_node_vectors
28
29
  attr_accessor :auto_rebuild
29
30
 
30
31
  # Create a fresh index.
31
32
  # If you want to call #build_index manually, use
32
33
  # ClassifierReborn::LSI.new :auto_rebuild => false
34
+ # If you want to use ContentNodes with cached vector transpositions, use
35
+ # lsi = ClassifierReborn::LSI.new :cache_node_vectors => true
33
36
  #
34
37
  def initialize(options = {})
35
- @auto_rebuild = true unless options[:auto_rebuild] == false
38
+ @auto_rebuild = options[:auto_rebuild] != false
36
39
  @word_list, @items = WordList.new, {}
37
40
  @version, @built_at_version = 0, -1
41
+ @language = options[:language] || 'en'
42
+ if @cache_node_vectors = options[:cache_node_vectors]
43
+ extend CachedContentNode::InstanceMethods
44
+ end
38
45
  end
39
46
 
40
47
  # Returns true if the index needs to be rebuilt. The index needs
41
48
  # to be built after all information is added, but before you start
42
49
  # using it for search, classification and cluster detection.
43
50
  def needs_rebuild?
44
- (@items.keys.size > 1) && (@version != @built_at_version)
51
+ (@items.size > 1) && (@version != @built_at_version)
45
52
  end
46
53
 
47
54
  # Adds an item to the index. item is assumed to be a string, but
@@ -58,8 +65,12 @@ module ClassifierReborn
58
65
  # lsi.add_item ar, *ar.categories { |x| ar.content }
59
66
  #
60
67
  def add_item( item, *categories, &block )
61
- clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
62
- @items[item] = ContentNode.new(clean_word_hash, *categories)
68
+ clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
69
+ @items[item] = if @cache_node_vectors
70
+ CachedContentNode.new(clean_word_hash, *categories)
71
+ else
72
+ ContentNode.new(clean_word_hash, *categories)
73
+ end
63
74
  @version += 1
64
75
  build_index if @auto_rebuild
65
76
  end
@@ -93,13 +104,6 @@ module ClassifierReborn
93
104
  @items.keys
94
105
  end
95
106
 
96
- # Returns the categories for a given indexed items. You are free to add and remove
97
- # items from this as you see fit. It does not invalide an index to change its categories.
98
- def categories_for(item)
99
- return [] unless @items[item]
100
- return @items[item].categories
101
- end
102
-
103
107
  # This function rebuilds the index if needs_rebuild? returns true.
104
108
  # For very large document spaces, this indexing operation may take some
105
109
  # time to complete, so it may be wise to place the operation in another
@@ -155,7 +159,7 @@ module ClassifierReborn
155
159
  return [] if needs_rebuild?
156
160
 
157
161
  avg_density = Hash.new
158
- @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
162
+ @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |x,y| x + y[1]} }
159
163
 
160
164
  avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
161
165
  end
@@ -179,9 +183,9 @@ module ClassifierReborn
179
183
  result =
180
184
  @items.keys.collect do |item|
181
185
  if $GSL
182
- val = content_node.search_vector * @items[item].search_vector.col
186
+ val = content_node.search_vector * @items[item].transposed_search_vector
183
187
  else
184
- val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
188
+ val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
185
189
  end
186
190
  [item, val]
187
191
  end
@@ -234,35 +238,42 @@ module ClassifierReborn
234
238
  # articles, or find paragraphs that relate to each other in an essay.
235
239
  def find_related( doc, max_nearest=3, &block )
236
240
  carry =
237
- proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
241
+ proximity_array_for_content( doc, &block ).reject { |pair| pair[0].eql? doc }
238
242
  result = carry.collect { |x| x[0] }
239
243
  return result[0..max_nearest-1]
240
244
  end
241
245
 
246
+ # Return the most obvious category with the score
247
+ def classify_with_score( doc, cutoff=0.30, &block)
248
+ return scored_categories(doc, cutoff, &block).last
249
+ end
250
+
251
+ # Return the most obvious category without the score
252
+ def classify( doc, cutoff=0.30, &block )
253
+ return scored_categories(doc, cutoff, &block).last.first
254
+ end
255
+
242
256
  # This function uses a voting system to categorize documents, based on
243
257
  # the categories of other documents. It uses the same logic as the
244
258
  # find_related function to find related documents, then returns the
245
- # most obvious category from this list.
259
+ # list of sorted categories.
246
260
  #
247
261
  # cutoff signifies the number of documents to consider when clasifying
248
262
  # text. A cutoff of 1 means that every document in the index votes on
249
263
  # what category the document is in. This may not always make sense.
250
264
  #
251
- def classify( doc, cutoff=0.30, &block )
265
+ def scored_categories( doc, cutoff=0.30, &block )
252
266
  icutoff = (@items.size * cutoff).round
253
267
  carry = proximity_array_for_content( doc, &block )
254
268
  carry = carry[0..icutoff-1]
255
- votes = {}
269
+ votes = Hash.new(0.0)
256
270
  carry.each do |pair|
257
- categories = @items[pair[0]].categories
258
- categories.each do |category|
259
- votes[category] ||= 0.0
271
+ @items[pair[0]].categories.each do |category|
260
272
  votes[category] += pair[1]
261
273
  end
262
274
  end
263
275
 
264
- ranking = votes.keys.sort_by { |x| votes[x] }
265
- return ranking[-1]
276
+ return votes.sort_by { |_, score| score }
266
277
  end
267
278
 
268
279
  # Prototype, only works on indexed documents.
@@ -293,7 +304,7 @@ module ClassifierReborn
293
304
  if @items[item]
294
305
  return @items[item]
295
306
  else
296
- clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
307
+ clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
297
308
 
298
309
  cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
299
310