classifier-reborn 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,15 +6,40 @@ require_relative 'category_namer'
6
6
 
7
7
  module ClassifierReborn
8
8
  class Bayes
9
+ CategoryNotFoundError = Class.new(StandardError)
10
+
9
11
  # The class can be created with one or more categories, each of which will be
10
12
  # initialized and given a training method. E.g.,
11
13
  # b = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting', 'Spam'
12
- def initialize(*categories)
14
+ #
15
+ # Options available are:
16
+ # language: 'en' Used to select language specific stop words
17
+ # auto_categorize: false When true, enables ability to dynamically declare a category
18
+ # enable_threshold: false When true, enables a threshold requirement for classification
19
+ # threshold: 0.0 Default threshold, only used when enabled
20
+ def initialize(*args)
13
21
  @categories = Hash.new
14
- categories.each { |category| @categories[CategoryNamer.prepare_name(category)] = Hash.new }
15
- @total_words = 0
16
- @category_counts = Hash.new(0)
17
- @category_word_count = Hash.new
22
+ options = { language: 'en',
23
+ auto_categorize: false,
24
+ enable_threshold: false,
25
+ threshold: 0.0
26
+ }
27
+ args.flatten.each { |arg|
28
+ if arg.kind_of?(Hash)
29
+ options.merge!(arg)
30
+ else
31
+ add_category(arg)
32
+ end
33
+ }
34
+
35
+ @total_words = 0
36
+ @category_counts = Hash.new(0)
37
+ @category_word_count = Hash.new(0)
38
+
39
+ @language = options[:language]
40
+ @auto_categorize = options[:auto_categorize]
41
+ @enable_threshold = options[:enable_threshold]
42
+ @threshold = options[:threshold]
18
43
  end
19
44
 
20
45
  # Provides a general training method for all categories specified in Bayes#new
@@ -25,10 +50,18 @@ module ClassifierReborn
25
50
  # b.train "The other", "The other text"
26
51
  def train(category, text)
27
52
  category = CategoryNamer.prepare_name(category)
28
- @category_word_count[category] ||= 0
53
+
54
+ # Add the category dynamically or raise an error
55
+ if !@categories.has_key?(category)
56
+ if @auto_categorize
57
+ add_category(category)
58
+ else
59
+ raise CategoryNotFoundError.new("Cannot train; category #{category} does not exist")
60
+ end
61
+ end
62
+
29
63
  @category_counts[category] += 1
30
- Hasher.word_hash(text).each do |word, count|
31
- @categories[category][word] ||= 0
64
+ Hasher.word_hash(text, @language).each do |word, count|
32
65
  @categories[category][word] += count
33
66
  @category_word_count[category] += count
34
67
  @total_words += count
@@ -44,12 +77,10 @@ module ClassifierReborn
44
77
  # b.untrain :this, "This text"
45
78
  def untrain(category, text)
46
79
  category = CategoryNamer.prepare_name(category)
47
- @category_word_count[category] ||= 0
48
80
  @category_counts[category] -= 1
49
- Hasher.word_hash(text).each do |word, count|
81
+ Hasher.word_hash(text, @language).each do |word, count|
50
82
  if @total_words >= 0
51
83
  orig = @categories[category][word] || 0
52
- @categories[category][word] ||= 0
53
84
  @categories[category][word] -= count
54
85
  if @categories[category][word] <= 0
55
86
  @categories[category].delete(word)
@@ -70,7 +101,7 @@ module ClassifierReborn
70
101
  # The largest of these scores (the one closest to 0) is the one picked out by #classify
71
102
  def classifications(text)
72
103
  score = Hash.new
73
- word_hash = Hasher.word_hash(text)
104
+ word_hash = Hasher.word_hash(text, @language)
74
105
  training_count = @category_counts.values.reduce(:+).to_f
75
106
  @categories.each do |category, category_words|
76
107
  score[category.to_s] = 0
@@ -87,11 +118,50 @@ module ClassifierReborn
87
118
  end
88
119
 
89
120
  # Returns the classification of the provided +text+, which is one of the
90
- # categories given in the initializer. E.g.,
121
+ # categories given in the initializer along with the score. E.g.,
91
122
  # b.classify "I hate bad words and you"
92
- # => 'Uninteresting'
123
+ # => ['Uninteresting', -4.852030263919617]
124
+ def classify_with_score(text)
125
+ (classifications(text).sort_by { |a| -a[1] })[0]
126
+ end
127
+
128
+ # Return the classification without the score
93
129
  def classify(text)
94
- (classifications(text).sort_by { |a| -a[1] })[0][0]
130
+ result, score = classify_with_score(text)
131
+ if threshold_enabled?
132
+ result = nil if score < @threshold || score == Float::INFINITY
133
+ end
134
+ return result
135
+ end
136
+
137
+ # Retrieve the current threshold value
138
+ def threshold
139
+ @threshold
140
+ end
141
+
142
+ # Dynamically set the threshold value
143
+ def threshold=(a_float)
144
+ @threshold = a_float
145
+ end
146
+
147
+ # Dynamically enable threshold for classify results
148
+ def enable_threshold
149
+ @enable_threshold = true
150
+ end
151
+
152
+ # Dynamically disable threshold for classify results
153
+ def disable_threshold
154
+ @enable_threshold = false
155
+ end
156
+
157
+ # Is threshold processing enabled?
158
+ def threshold_enabled?
159
+ @enable_threshold
160
+ end
161
+
162
+ # Is threshold processing disabled?
163
+ def threshold_disabled?
164
+ !@enable_threshold
95
165
  end
96
166
 
97
167
  # Provides training and untraining methods for the categories specified in Bayes#new
@@ -130,7 +200,7 @@ module ClassifierReborn
130
200
  # more criteria than the trained selective categories. In short,
131
201
  # try to initialize your categories at initialization.
132
202
  def add_category(category)
133
- @categories[CategoryNamer.prepare_name(category)] = Hash.new
203
+ @categories[CategoryNamer.prepare_name(category)] ||= Hash.new(0)
134
204
  end
135
205
 
136
206
  alias append_category add_category
@@ -9,7 +9,9 @@ module ClassifierReborn
9
9
  module CategoryNamer
10
10
  extend self
11
11
  def prepare_name(name)
12
- name.to_s.gsub("_"," ").capitalize.intern
12
+ return name if name.is_a?(Symbol)
13
+
14
+ name.to_s.gsub("_"," ").capitalize.intern
13
15
  end
14
16
  end
15
17
  end
@@ -1,39 +1,33 @@
1
+ # encoding: utf-8
1
2
  # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
3
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
4
  # License:: LGPL
4
5
 
5
- require "set"
6
+ require 'set'
6
7
 
7
8
  module ClassifierReborn
8
9
  module Hasher
9
- extend self
10
+ STOPWORDS_PATH = [File.expand_path(File.dirname(__FILE__) + '/../../../data/stopwords')]
10
11
 
11
- # Removes common punctuation symbols, returning a new string.
12
- # E.g.,
13
- # "Hello (greeting's), with {braces} < >...?".without_punctuation
14
- # => "Hello greetings with braces "
15
- def without_punctuation(str)
16
- str .tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
17
- end
12
+ extend self
18
13
 
19
14
  # Return a Hash of strings => ints. Each word in the string is stemmed,
20
15
  # interned, and indexes to its frequency in the document.
21
- def word_hash(str)
22
- word_hash = clean_word_hash(str)
23
- symbol_hash = word_hash_for_symbols(str.gsub(/[\w]/," ").split)
24
- return clean_word_hash(str).merge(symbol_hash)
16
+ def word_hash(str, language = 'en')
17
+ cleaned_word_hash = clean_word_hash(str, language)
18
+ symbol_hash = word_hash_for_symbols(str.scan(/[^\s\p{WORD}]/))
19
+ return cleaned_word_hash.merge(symbol_hash)
25
20
  end
26
21
 
27
22
  # Return a word hash without extra punctuation or short symbols, just stemmed words
28
- def clean_word_hash(str)
29
- word_hash_for_words str.gsub(/[^\w\s]/,"").split
23
+ def clean_word_hash(str, language = 'en')
24
+ word_hash_for_words str.gsub(/[^\p{WORD}\s]/,'').downcase.split, language
30
25
  end
31
26
 
32
- def word_hash_for_words(words)
27
+ def word_hash_for_words(words, language = 'en')
33
28
  d = Hash.new(0)
34
29
  words.each do |word|
35
- word.downcase!
36
- if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
30
+ if word.length > 2 && !STOPWORDS[language].include?(word)
37
31
  d[word.stem.intern] += 1
38
32
  end
39
33
  end
@@ -48,87 +42,18 @@ module ClassifierReborn
48
42
  return d
49
43
  end
50
44
 
51
- CORPUS_SKIP_WORDS = Set.new(%w[
52
- a
53
- again
54
- all
55
- along
56
- are
57
- also
58
- an
59
- and
60
- as
61
- at
62
- but
63
- by
64
- came
65
- can
66
- cant
67
- couldnt
68
- did
69
- didn
70
- didnt
71
- do
72
- doesnt
73
- dont
74
- ever
75
- first
76
- from
77
- have
78
- her
79
- here
80
- him
81
- how
82
- i
83
- if
84
- in
85
- into
86
- is
87
- isnt
88
- it
89
- itll
90
- just
91
- last
92
- least
93
- like
94
- most
95
- my
96
- new
97
- no
98
- not
99
- now
100
- of
101
- on
102
- or
103
- should
104
- sinc
105
- so
106
- some
107
- th
108
- than
109
- this
110
- that
111
- the
112
- their
113
- then
114
- those
115
- to
116
- told
117
- too
118
- true
119
- try
120
- until
121
- url
122
- us
123
- were
124
- when
125
- whether
126
- while
127
- with
128
- within
129
- yes
130
- you
131
- youll
132
- ])
45
+ # Create a lazily-loaded hash of stopword data
46
+ STOPWORDS = Hash.new do |hash, language|
47
+ hash[language] = []
48
+
49
+ STOPWORDS_PATH.each do |path|
50
+ if File.exist?(File.join(path, language))
51
+ hash[language] = Set.new File.read(File.join(path, language.to_s)).split
52
+ break
53
+ end
54
+ end
55
+
56
+ hash[language]
57
+ end
133
58
  end
134
59
  end
@@ -21,7 +21,6 @@ class Matrix
21
21
 
22
22
  qrot = q.dup
23
23
  v = Matrix.identity(q.row_size)
24
- azrot = nil
25
24
  mzrot = nil
26
25
  cnt = 0
27
26
  s_old = nil
@@ -15,6 +15,7 @@ end
15
15
 
16
16
  require_relative 'lsi/word_list'
17
17
  require_relative 'lsi/content_node'
18
+ require_relative 'lsi/cached_content_node'
18
19
  require_relative 'lsi/summarizer'
19
20
 
20
21
  module ClassifierReborn
@@ -24,24 +25,30 @@ module ClassifierReborn
24
25
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
25
26
  class LSI
26
27
 
27
- attr_reader :word_list
28
+ attr_reader :word_list, :cache_node_vectors
28
29
  attr_accessor :auto_rebuild
29
30
 
30
31
  # Create a fresh index.
31
32
  # If you want to call #build_index manually, use
32
33
  # ClassifierReborn::LSI.new :auto_rebuild => false
34
+ # If you want to use ContentNodes with cached vector transpositions, use
35
+ # lsi = ClassifierReborn::LSI.new :cache_node_vectors => true
33
36
  #
34
37
  def initialize(options = {})
35
- @auto_rebuild = true unless options[:auto_rebuild] == false
38
+ @auto_rebuild = options[:auto_rebuild] != false
36
39
  @word_list, @items = WordList.new, {}
37
40
  @version, @built_at_version = 0, -1
41
+ @language = options[:language] || 'en'
42
+ if @cache_node_vectors = options[:cache_node_vectors]
43
+ extend CachedContentNode::InstanceMethods
44
+ end
38
45
  end
39
46
 
40
47
  # Returns true if the index needs to be rebuilt. The index needs
41
48
  # to be built after all information is added, but before you start
42
49
  # using it for search, classification and cluster detection.
43
50
  def needs_rebuild?
44
- (@items.keys.size > 1) && (@version != @built_at_version)
51
+ (@items.size > 1) && (@version != @built_at_version)
45
52
  end
46
53
 
47
54
  # Adds an item to the index. item is assumed to be a string, but
@@ -58,8 +65,12 @@ module ClassifierReborn
58
65
  # lsi.add_item ar, *ar.categories { |x| ar.content }
59
66
  #
60
67
  def add_item( item, *categories, &block )
61
- clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
62
- @items[item] = ContentNode.new(clean_word_hash, *categories)
68
+ clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
69
+ @items[item] = if @cache_node_vectors
70
+ CachedContentNode.new(clean_word_hash, *categories)
71
+ else
72
+ ContentNode.new(clean_word_hash, *categories)
73
+ end
63
74
  @version += 1
64
75
  build_index if @auto_rebuild
65
76
  end
@@ -93,13 +104,6 @@ module ClassifierReborn
93
104
  @items.keys
94
105
  end
95
106
 
96
- # Returns the categories for a given indexed items. You are free to add and remove
97
- # items from this as you see fit. It does not invalide an index to change its categories.
98
- def categories_for(item)
99
- return [] unless @items[item]
100
- return @items[item].categories
101
- end
102
-
103
107
  # This function rebuilds the index if needs_rebuild? returns true.
104
108
  # For very large document spaces, this indexing operation may take some
105
109
  # time to complete, so it may be wise to place the operation in another
@@ -155,7 +159,7 @@ module ClassifierReborn
155
159
  return [] if needs_rebuild?
156
160
 
157
161
  avg_density = Hash.new
158
- @items.each_key { |x| avg_density[x] = proximity_array_for_content(x).inject(0.0) { |x,y| x + y[1]} }
162
+ @items.each_key { |item| avg_density[item] = proximity_array_for_content(item).inject(0.0) { |x,y| x + y[1]} }
159
163
 
160
164
  avg_density.keys.sort_by { |x| avg_density[x] }.reverse[0..max_chunks-1].map
161
165
  end
@@ -179,9 +183,9 @@ module ClassifierReborn
179
183
  result =
180
184
  @items.keys.collect do |item|
181
185
  if $GSL
182
- val = content_node.search_vector * @items[item].search_vector.col
186
+ val = content_node.search_vector * @items[item].transposed_search_vector
183
187
  else
184
- val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
188
+ val = (Matrix[content_node.search_vector] * @items[item].search_vector)[0]
185
189
  end
186
190
  [item, val]
187
191
  end
@@ -234,35 +238,42 @@ module ClassifierReborn
234
238
  # articles, or find paragraphs that relate to each other in an essay.
235
239
  def find_related( doc, max_nearest=3, &block )
236
240
  carry =
237
- proximity_array_for_content( doc, &block ).reject { |pair| pair[0] == doc }
241
+ proximity_array_for_content( doc, &block ).reject { |pair| pair[0].eql? doc }
238
242
  result = carry.collect { |x| x[0] }
239
243
  return result[0..max_nearest-1]
240
244
  end
241
245
 
246
+ # Return the most obvious category with the score
247
+ def classify_with_score( doc, cutoff=0.30, &block)
248
+ return scored_categories(doc, cutoff, &block).last
249
+ end
250
+
251
+ # Return the most obvious category without the score
252
+ def classify( doc, cutoff=0.30, &block )
253
+ return scored_categories(doc, cutoff, &block).last.first
254
+ end
255
+
242
256
  # This function uses a voting system to categorize documents, based on
243
257
  # the categories of other documents. It uses the same logic as the
244
258
  # find_related function to find related documents, then returns the
245
- # most obvious category from this list.
259
+ # list of sorted categories.
246
260
  #
247
261
  # cutoff signifies the number of documents to consider when classifying
248
262
  # text. A cutoff of 1 means that every document in the index votes on
249
263
  # what category the document is in. This may not always make sense.
250
264
  #
251
- def classify( doc, cutoff=0.30, &block )
265
+ def scored_categories( doc, cutoff=0.30, &block )
252
266
  icutoff = (@items.size * cutoff).round
253
267
  carry = proximity_array_for_content( doc, &block )
254
268
  carry = carry[0..icutoff-1]
255
- votes = {}
269
+ votes = Hash.new(0.0)
256
270
  carry.each do |pair|
257
- categories = @items[pair[0]].categories
258
- categories.each do |category|
259
- votes[category] ||= 0.0
271
+ @items[pair[0]].categories.each do |category|
260
272
  votes[category] += pair[1]
261
273
  end
262
274
  end
263
275
 
264
- ranking = votes.keys.sort_by { |x| votes[x] }
265
- return ranking[-1]
276
+ return votes.sort_by { |_, score| score }
266
277
  end
267
278
 
268
279
  # Prototype, only works on indexed documents.
@@ -293,7 +304,7 @@ module ClassifierReborn
293
304
  if @items[item]
294
305
  return @items[item]
295
306
  else
296
- clean_word_hash = Hasher.clean_word_hash(block ? block.call(item) : item.to_s)
307
+ clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language)
297
308
 
298
309
  cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
299
310