reclassifier 0.0.4 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,15 +6,22 @@
6
6
  # Cambridge University Press. 2008, ISBN 0521865719.
7
7
  #
8
8
  class Reclassifier::Bayes
9
+ include Reclassifier::WordHash
10
+
9
11
  # Can be created with zero or more classifications, each of which will be
10
12
  # initialized and given a training method. The classifications are specified as
11
- # symbols. E.g.,
12
- # b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
13
- def initialize(*classifications)
13
+ # an array of symbols. Options are specified in a hash.
14
+ #
15
+ # Options:
16
+ # * :clean - If false, punctuation will be included in the classifier. Otherwise, punctuation will be omitted. Default is true.
17
+ #
18
+ # b = Reclassifier::Bayes.new([:interesting, :uninteresting, :spam], :clean => true)
19
+ def initialize(classifications = [], options = {})
14
20
  @classifications = {}
15
- classifications.each {|classification| @classifications[classification] = {}}
16
-
17
21
  @docs_in_classification_count = {}
22
+ @options = options
23
+
24
+ classifications.each {|classification| add_classification(classification)}
18
25
  end
19
26
 
20
27
  #
@@ -26,10 +33,9 @@ class Reclassifier::Bayes
26
33
  def train(classification, text)
27
34
  ensure_classification_exists(classification)
28
35
 
29
- @docs_in_classification_count[classification] ||= 0
30
36
  @docs_in_classification_count[classification] += 1
31
37
 
32
- text.word_hash.each do |word, count|
38
+ smart_word_hash(text).each do |word, count|
33
39
  @classifications[classification][word] ||= 0
34
40
 
35
41
  @classifications[classification][word] += count
@@ -49,7 +55,7 @@ class Reclassifier::Bayes
49
55
 
50
56
  @docs_in_classification_count[classification] -= 1
51
57
 
52
- text.word_hash.each do |word, count|
58
+ smart_word_hash(text).each do |word, count|
53
59
  @classifications[classification][word] -= count if @classifications[classification].include?(word)
54
60
  end
55
61
  end
@@ -68,11 +74,11 @@ class Reclassifier::Bayes
68
74
  scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
69
75
 
70
76
  # likelihood
71
- text.word_hash.each do |word, count|
77
+ smart_word_hash(text).each do |word, count|
72
78
  if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
73
79
  scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
74
80
 
75
- scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
81
+ scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+).to_i + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
76
82
  end
77
83
  end
78
84
  end
@@ -107,6 +113,8 @@ class Reclassifier::Bayes
107
113
  def add_classification(classification)
108
114
  @classifications[classification] ||= {}
109
115
 
116
+ @docs_in_classification_count[classification] ||= 0
117
+
110
118
  classification
111
119
  end
112
120
 
@@ -132,4 +140,12 @@ class Reclassifier::Bayes
132
140
  def ensure_classification_exists(classification)
133
141
  raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
134
142
  end
143
+
144
+ def smart_word_hash(string)
145
+ if @options[:clean] == false
146
+ word_hash(string)
147
+ else
148
+ clean_word_hash(string)
149
+ end
150
+ end
135
151
  end
@@ -1,120 +1,4 @@
1
1
  class String
2
-
3
- # Removes common punctuation symbols, returning a new string.
4
- # E.g.,
5
- # "Hello (greeting's), with {braces} < >...?".without_punctuation
6
- # => "Hello greetings with braces "
7
- def without_punctuation
8
- tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
9
- end
10
-
11
- # Return a Hash of strings => ints. Each word in the string is stemmed,
12
- # symbolized, and indexed to its frequency in the document.
13
- def word_hash
14
- word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
15
- end
16
-
17
- # Return a word hash without extra punctuation or short symbols, just stemmed words
18
- def clean_word_hash
19
- word_hash_for_words gsub(/[^\w\s]/,"").split
20
- end
21
-
22
- def word_hash_for_words(words)
23
- d = Hash.new
24
- words.each do |word|
25
- word.downcase! if word =~ /[\w]+/
26
- key = word.stem.to_sym
27
- if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
28
- d[key] ||= 0
29
- d[key] += 1
30
- end
31
- end
32
- return d
33
- end
34
-
35
- CORPUS_SKIP_WORDS = [
36
- "a",
37
- "again",
38
- "all",
39
- "along",
40
- "are",
41
- "also",
42
- "an",
43
- "and",
44
- "as",
45
- "at",
46
- "but",
47
- "by",
48
- "came",
49
- "can",
50
- "cant",
51
- "couldnt",
52
- "did",
53
- "didn",
54
- "didnt",
55
- "do",
56
- "doesnt",
57
- "dont",
58
- "ever",
59
- "first",
60
- "from",
61
- "have",
62
- "her",
63
- "here",
64
- "him",
65
- "how",
66
- "i",
67
- "if",
68
- "in",
69
- "into",
70
- "is",
71
- "isnt",
72
- "it",
73
- "itll",
74
- "just",
75
- "last",
76
- "least",
77
- "like",
78
- "most",
79
- "my",
80
- "new",
81
- "no",
82
- "not",
83
- "now",
84
- "of",
85
- "on",
86
- "or",
87
- "should",
88
- "sinc",
89
- "so",
90
- "some",
91
- "th",
92
- "than",
93
- "this",
94
- "that",
95
- "the",
96
- "their",
97
- "then",
98
- "those",
99
- "to",
100
- "told",
101
- "too",
102
- "true",
103
- "try",
104
- "until",
105
- "url",
106
- "us",
107
- "were",
108
- "when",
109
- "whether",
110
- "while",
111
- "with",
112
- "within",
113
- "yes",
114
- "you",
115
- "youll",
116
- ]
117
-
118
2
  def summary( count=10, separator=" [...] " )
119
3
  perform_lsi split_sentences, count, separator
120
4
  end
@@ -6,6 +6,7 @@ module Reclassifier
6
6
  # data based on underlying semantic relations. For more information on the algorithms used,
7
7
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
8
8
  class LSI
9
+ include Reclassifier::WordHash
9
10
 
10
11
  attr_reader :word_list
11
12
  attr_accessor :auto_rebuild
@@ -41,7 +42,7 @@ module Reclassifier
41
42
  # lsi.add_item ar, *ar.categories { |x| ar.content }
42
43
  #
43
44
  def add_item( item, *categories, &block )
44
- clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
45
+ clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
45
46
  @items[item] = ContentNode.new(clean_word_hash, *categories)
46
47
  @version += 1
47
48
  build_index if @auto_rebuild
@@ -276,7 +277,7 @@ module Reclassifier
276
277
  if @items[item]
277
278
  return @items[item]
278
279
  else
279
- clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
280
+ clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
280
281
 
281
282
  cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
282
283
 
@@ -1,3 +1,3 @@
1
1
  module Reclassifier
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -0,0 +1,111 @@
1
+ module Reclassifier::WordHash
2
+ CORPUS_SKIP_WORDS = ["a",
3
+ "again",
4
+ "all",
5
+ "along",
6
+ "are",
7
+ "also",
8
+ "an",
9
+ "and",
10
+ "as",
11
+ "at",
12
+ "but",
13
+ "by",
14
+ "came",
15
+ "can",
16
+ "cant",
17
+ "couldnt",
18
+ "did",
19
+ "didn",
20
+ "didnt",
21
+ "do",
22
+ "doesnt",
23
+ "dont",
24
+ "ever",
25
+ "first",
26
+ "from",
27
+ "have",
28
+ "her",
29
+ "here",
30
+ "him",
31
+ "how",
32
+ "i",
33
+ "if",
34
+ "in",
35
+ "into",
36
+ "is",
37
+ "isnt",
38
+ "it",
39
+ "itll",
40
+ "just",
41
+ "last",
42
+ "least",
43
+ "like",
44
+ "most",
45
+ "my",
46
+ "new",
47
+ "no",
48
+ "not",
49
+ "now",
50
+ "of",
51
+ "on",
52
+ "or",
53
+ "should",
54
+ "sinc",
55
+ "so",
56
+ "some",
57
+ "th",
58
+ "than",
59
+ "this",
60
+ "that",
61
+ "the",
62
+ "their",
63
+ "then",
64
+ "those",
65
+ "to",
66
+ "told",
67
+ "too",
68
+ "true",
69
+ "try",
70
+ "until",
71
+ "url",
72
+ "us",
73
+ "were",
74
+ "when",
75
+ "whether",
76
+ "while",
77
+ "with",
78
+ "within",
79
+ "yes",
80
+ "you",
81
+ "youll"]
82
+
83
+ # Return a Hash of strings => ints. Each word in the string is stemmed,
84
+ # symbolized, and indexed to its frequency in the document.
85
+ def word_hash(string)
86
+ word_hash_for_words(string.gsub(/[^\w\s]/,"").split + string.gsub(/[\w]/," ").split)
87
+ end
88
+
89
+ # Return a word hash without extra punctuation or short symbols, just stemmed words
90
+ def clean_word_hash(string)
91
+ word_hash_for_words(string.gsub(/[^\w\s]/,"").split)
92
+ end
93
+
94
+ def word_hash_for_words(words)
95
+ d = {}
96
+
97
+ words.each do |word|
98
+ word.downcase!
99
+
100
+ key = word.stem.to_sym
101
+
102
+ if word =~ /[^\w]/ || !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
103
+ d[key] ||= 0
104
+ d[key] += 1
105
+ end
106
+ end
107
+
108
+ d
109
+ end
110
+
111
+ end
data/lib/reclassifier.rb CHANGED
@@ -12,8 +12,9 @@ require 'gsl/vector'
12
12
 
13
13
  module Reclassifier
14
14
  autoload :Bayes, 'reclassifier/bayes'
15
- autoload :LSI, 'reclassifier/lsi'
16
15
  autoload :ContentNode, 'reclassifier/content_node'
17
- autoload :WordList, 'reclassifier/word_list'
16
+ autoload :LSI, 'reclassifier/lsi'
18
17
  autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
18
+ autoload :WordHash, 'reclassifier/word_hash'
19
+ autoload :WordList, 'reclassifier/word_list'
19
20
  end
data/spec/bayes_spec.rb CHANGED
@@ -3,7 +3,7 @@ require 'spec_helper'
3
3
  describe Reclassifier::Bayes do
4
4
  describe "classifications" do
5
5
  it "should return the classifications" do
6
- subject = described_class.new(:interesting, :uninteresting)
6
+ subject = described_class.new([:interesting, :uninteresting])
7
7
 
8
8
  subject.classifications.sort.should eq([:interesting, :uninteresting])
9
9
  end
@@ -15,7 +15,7 @@ describe Reclassifier::Bayes do
15
15
  end
16
16
 
17
17
  it "should train the classifier to the (classification, document) pair" do
18
- subject = described_class.new(:in_china, :not_in_china)
18
+ subject = described_class.new([:in_china, :not_in_china])
19
19
 
20
20
  subject.train(:in_china, 'Chinese Beijing Chinese')
21
21
  subject.train(:in_china, 'Chinese Chinese Shanghai')
@@ -32,7 +32,7 @@ describe Reclassifier::Bayes do
32
32
  end
33
33
 
34
34
  it "should untrain the classifier against the (classification, document) pair" do
35
- subject = described_class.new(:in_china, :not_in_china)
35
+ subject = described_class.new([:in_china, :not_in_china])
36
36
 
37
37
  subject.train(:in_china, 'Chinese Chinese')
38
38
  subject.train(:not_in_china, 'Chinese Macao')
@@ -47,7 +47,7 @@ describe Reclassifier::Bayes do
47
47
 
48
48
  describe "calculate_scores" do
49
49
  it "should return a score hash with the correct scores" do
50
- subject = described_class.new(:in_china, :not_in_china)
50
+ subject = described_class.new([:in_china, :not_in_china])
51
51
 
52
52
  subject.train(:in_china, 'Chinese Beijing Chinese')
53
53
  subject.train(:in_china, 'Chinese Chinese Shanghai')
@@ -59,6 +59,14 @@ describe Reclassifier::Bayes do
59
59
  scores[:in_china].should eq(-8.107690312843907)
60
60
  scores[:not_in_china].should eq(-8.906681345001262)
61
61
  end
62
+
63
+ it "should handle the case when no documents are classified for a particular classification" do
64
+ subject = described_class.new([:in_china, :not_in_china])
65
+
66
+ subject.train(:in_china, 'Chinese Beijing Chinese')
67
+
68
+ subject.calculate_scores('Chinese Beijing')
69
+ end
62
70
  end
63
71
 
64
72
  describe "add_classification" do
@@ -94,4 +102,33 @@ describe Reclassifier::Bayes do
94
102
  subject.remove_classification(:niner).should be(nil)
95
103
  end
96
104
  end
105
+
106
+ context ':clean option' do
107
+ it 'should cause punctuation to be omitted if it is set to true' do
108
+ subject = described_class.new([:one, :other], {:clean => true})
109
+
110
+ subject.train(:one, '! ! ! ! bbb')
111
+ subject.train(:other, 'aaa')
112
+
113
+ subject.classify('! aaa !').should eq(:other)
114
+ end
115
+
116
+ it 'should default to true' do
117
+ subject = described_class.new([:one, :other])
118
+
119
+ subject.train(:one, '! ! ! ! bbb')
120
+ subject.train(:other, 'aaa')
121
+
122
+ subject.classify('! aaa !').should eq(:other)
123
+ end
124
+
125
+ it 'should cause punctuation not to be omitted if it is set to false' do
126
+ subject = described_class.new([:one, :other], {:clean => false})
127
+
128
+ subject.train(:one, '! ! ! ! bbb')
129
+ subject.train(:other, 'aaa')
130
+
131
+ subject.classify('! aaa !').should eq(:one)
132
+ end
133
+ end
97
134
  end
data/spec/lsi_spec.rb CHANGED
@@ -45,7 +45,7 @@ describe Reclassifier::LSI do
45
45
  end
46
46
 
47
47
  it "should perform better than Bayes" do
48
- bayes = Reclassifier::Bayes.new :dog, :cat, :bird
48
+ bayes = Reclassifier::Bayes.new([:dog, :cat, :bird])
49
49
 
50
50
  [[@str1, "Dog"],
51
51
  [@str2, "Dog"],
@@ -1,11 +1,11 @@
1
- require 'spec_helper'
1
+ require "spec_helper"
2
2
 
3
- describe String do
3
+ describe Reclassifier::Bayes do
4
4
  describe "word_hash" do
5
5
  it "should hash text" do
6
6
  hash = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
7
7
 
8
- "here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
8
+ subject.word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
9
9
  end
10
10
  end
11
11
 
@@ -13,7 +13,7 @@ describe String do
13
13
  it "should clean and hash text" do
14
14
  hash = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
15
15
 
16
- "here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
16
+ subject.clean_word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
17
17
  end
18
18
  end
19
19
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reclassifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -114,13 +114,14 @@ files:
114
114
  - lib/reclassifier/lsi.rb
115
115
  - lib/reclassifier/unknown_classification_error.rb
116
116
  - lib/reclassifier/version.rb
117
+ - lib/reclassifier/word_hash.rb
117
118
  - lib/reclassifier/word_list.rb
118
119
  - reclassifier.gemspec
119
120
  - spec/bayes_spec.rb
120
121
  - spec/core_ext/array_spec.rb
121
- - spec/core_ext/string_spec.rb
122
122
  - spec/lsi_spec.rb
123
123
  - spec/spec_helper.rb
124
+ - spec/word_hash_spec.rb
124
125
  homepage: https://github.com/saveup/reclassifier
125
126
  licenses:
126
127
  - LGPL
@@ -149,6 +150,6 @@ summary: Bayesian and Latent Semantic Indexing classification of text.
149
150
  test_files:
150
151
  - spec/bayes_spec.rb
151
152
  - spec/core_ext/array_spec.rb
152
- - spec/core_ext/string_spec.rb
153
153
  - spec/lsi_spec.rb
154
154
  - spec/spec_helper.rb
155
+ - spec/word_hash_spec.rb