reclassifier 0.0.4 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,15 +6,22 @@
6
6
  # Cambridge University Press. 2008, ISBN 0521865719.
7
7
  #
8
8
  class Reclassifier::Bayes
9
+ include Reclassifier::WordHash
10
+
9
11
  # Can be created with zero or more classifications, each of which will be
10
12
  # initialized and given a training method. The classifications are specified as
11
- # symbols. E.g.,
12
- # b = Reclassifier::Bayes.new :interesting, :uninteresting, :spam
13
- def initialize(*classifications)
13
+ # an array of symbols. Options are specified in a hash.
14
+ #
15
+ # Options:
16
+ # * :clean - If false, punctuation will be included in the classifier. Otherwise, punctuation will be omitted. Default is true.
17
+ #
18
+ # b = Reclassifier::Bayes.new([:interesting, :uninteresting, :spam], :clean => true)
19
+ def initialize(classifications = [], options = {})
14
20
  @classifications = {}
15
- classifications.each {|classification| @classifications[classification] = {}}
16
-
17
21
  @docs_in_classification_count = {}
22
+ @options = options
23
+
24
+ classifications.each {|classification| add_classification(classification)}
18
25
  end
19
26
 
20
27
  #
@@ -26,10 +33,9 @@ class Reclassifier::Bayes
26
33
  def train(classification, text)
27
34
  ensure_classification_exists(classification)
28
35
 
29
- @docs_in_classification_count[classification] ||= 0
30
36
  @docs_in_classification_count[classification] += 1
31
37
 
32
- text.word_hash.each do |word, count|
38
+ smart_word_hash(text).each do |word, count|
33
39
  @classifications[classification][word] ||= 0
34
40
 
35
41
  @classifications[classification][word] += count
@@ -49,7 +55,7 @@ class Reclassifier::Bayes
49
55
 
50
56
  @docs_in_classification_count[classification] -= 1
51
57
 
52
- text.word_hash.each do |word, count|
58
+ smart_word_hash(text).each do |word, count|
53
59
  @classifications[classification][word] -= count if @classifications[classification].include?(word)
54
60
  end
55
61
  end
@@ -68,11 +74,11 @@ class Reclassifier::Bayes
68
74
  scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
69
75
 
70
76
  # likelihood
71
- text.word_hash.each do |word, count|
77
+ smart_word_hash(text).each do |word, count|
72
78
  if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
73
79
  scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
74
80
 
75
- scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
81
+ scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+).to_i + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
76
82
  end
77
83
  end
78
84
  end
@@ -107,6 +113,8 @@ class Reclassifier::Bayes
107
113
  def add_classification(classification)
108
114
  @classifications[classification] ||= {}
109
115
 
116
+ @docs_in_classification_count[classification] ||= 0
117
+
110
118
  classification
111
119
  end
112
120
 
@@ -132,4 +140,12 @@ class Reclassifier::Bayes
132
140
  def ensure_classification_exists(classification)
133
141
  raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
134
142
  end
143
+
144
+ def smart_word_hash(string)
145
+ if @options[:clean] == false
146
+ word_hash(string)
147
+ else
148
+ clean_word_hash(string)
149
+ end
150
+ end
135
151
  end
@@ -1,120 +1,4 @@
1
1
  class String
2
-
3
- # Removes common punctuation symbols, returning a new string.
4
- # E.g.,
5
- # "Hello (greeting's), with {braces} < >...?".without_punctuation
6
- # => "Hello greetings with braces "
7
- def without_punctuation
8
- tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
9
- end
10
-
11
- # Return a Hash of strings => ints. Each word in the string is stemmed,
12
- # symbolized, and indexed to its frequency in the document.
13
- def word_hash
14
- word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
15
- end
16
-
17
- # Return a word hash without extra punctuation or short symbols, just stemmed words
18
- def clean_word_hash
19
- word_hash_for_words gsub(/[^\w\s]/,"").split
20
- end
21
-
22
- def word_hash_for_words(words)
23
- d = Hash.new
24
- words.each do |word|
25
- word.downcase! if word =~ /[\w]+/
26
- key = word.stem.to_sym
27
- if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
28
- d[key] ||= 0
29
- d[key] += 1
30
- end
31
- end
32
- return d
33
- end
34
-
35
- CORPUS_SKIP_WORDS = [
36
- "a",
37
- "again",
38
- "all",
39
- "along",
40
- "are",
41
- "also",
42
- "an",
43
- "and",
44
- "as",
45
- "at",
46
- "but",
47
- "by",
48
- "came",
49
- "can",
50
- "cant",
51
- "couldnt",
52
- "did",
53
- "didn",
54
- "didnt",
55
- "do",
56
- "doesnt",
57
- "dont",
58
- "ever",
59
- "first",
60
- "from",
61
- "have",
62
- "her",
63
- "here",
64
- "him",
65
- "how",
66
- "i",
67
- "if",
68
- "in",
69
- "into",
70
- "is",
71
- "isnt",
72
- "it",
73
- "itll",
74
- "just",
75
- "last",
76
- "least",
77
- "like",
78
- "most",
79
- "my",
80
- "new",
81
- "no",
82
- "not",
83
- "now",
84
- "of",
85
- "on",
86
- "or",
87
- "should",
88
- "sinc",
89
- "so",
90
- "some",
91
- "th",
92
- "than",
93
- "this",
94
- "that",
95
- "the",
96
- "their",
97
- "then",
98
- "those",
99
- "to",
100
- "told",
101
- "too",
102
- "true",
103
- "try",
104
- "until",
105
- "url",
106
- "us",
107
- "were",
108
- "when",
109
- "whether",
110
- "while",
111
- "with",
112
- "within",
113
- "yes",
114
- "you",
115
- "youll",
116
- ]
117
-
118
2
  def summary( count=10, separator=" [...] " )
119
3
  perform_lsi split_sentences, count, separator
120
4
  end
@@ -6,6 +6,7 @@ module Reclassifier
6
6
  # data based on underlying semantic relations. For more information on the algorithms used,
7
7
  # please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
8
8
  class LSI
9
+ include Reclassifier::WordHash
9
10
 
10
11
  attr_reader :word_list
11
12
  attr_accessor :auto_rebuild
@@ -41,7 +42,7 @@ module Reclassifier
41
42
  # lsi.add_item ar, *ar.categories { |x| ar.content }
42
43
  #
43
44
  def add_item( item, *categories, &block )
44
- clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
45
+ clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
45
46
  @items[item] = ContentNode.new(clean_word_hash, *categories)
46
47
  @version += 1
47
48
  build_index if @auto_rebuild
@@ -276,7 +277,7 @@ module Reclassifier
276
277
  if @items[item]
277
278
  return @items[item]
278
279
  else
279
- clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
280
+ clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
280
281
 
281
282
  cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
282
283
 
@@ -1,3 +1,3 @@
1
1
  module Reclassifier
2
- VERSION = "0.0.4"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -0,0 +1,111 @@
1
+ module Reclassifier::WordHash
2
+ CORPUS_SKIP_WORDS = ["a",
3
+ "again",
4
+ "all",
5
+ "along",
6
+ "are",
7
+ "also",
8
+ "an",
9
+ "and",
10
+ "as",
11
+ "at",
12
+ "but",
13
+ "by",
14
+ "came",
15
+ "can",
16
+ "cant",
17
+ "couldnt",
18
+ "did",
19
+ "didn",
20
+ "didnt",
21
+ "do",
22
+ "doesnt",
23
+ "dont",
24
+ "ever",
25
+ "first",
26
+ "from",
27
+ "have",
28
+ "her",
29
+ "here",
30
+ "him",
31
+ "how",
32
+ "i",
33
+ "if",
34
+ "in",
35
+ "into",
36
+ "is",
37
+ "isnt",
38
+ "it",
39
+ "itll",
40
+ "just",
41
+ "last",
42
+ "least",
43
+ "like",
44
+ "most",
45
+ "my",
46
+ "new",
47
+ "no",
48
+ "not",
49
+ "now",
50
+ "of",
51
+ "on",
52
+ "or",
53
+ "should",
54
+ "sinc",
55
+ "so",
56
+ "some",
57
+ "th",
58
+ "than",
59
+ "this",
60
+ "that",
61
+ "the",
62
+ "their",
63
+ "then",
64
+ "those",
65
+ "to",
66
+ "told",
67
+ "too",
68
+ "true",
69
+ "try",
70
+ "until",
71
+ "url",
72
+ "us",
73
+ "were",
74
+ "when",
75
+ "whether",
76
+ "while",
77
+ "with",
78
+ "within",
79
+ "yes",
80
+ "you",
81
+ "youll"]
82
+
83
+ # Return a Hash of strings => ints. Each word in the string is stemmed,
84
+ # symbolized, and indexed to its frequency in the document.
85
+ def word_hash(string)
86
+ word_hash_for_words(string.gsub(/[^\w\s]/,"").split + string.gsub(/[\w]/," ").split)
87
+ end
88
+
89
+ # Return a word hash without extra punctuation or short symbols, just stemmed words
90
+ def clean_word_hash(string)
91
+ word_hash_for_words(string.gsub(/[^\w\s]/,"").split)
92
+ end
93
+
94
+ def word_hash_for_words(words)
95
+ d = {}
96
+
97
+ words.each do |word|
98
+ word.downcase!
99
+
100
+ key = word.stem.to_sym
101
+
102
+ if word =~ /[^\w]/ || !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
103
+ d[key] ||= 0
104
+ d[key] += 1
105
+ end
106
+ end
107
+
108
+ d
109
+ end
110
+
111
+ end
data/lib/reclassifier.rb CHANGED
@@ -12,8 +12,9 @@ require 'gsl/vector'
12
12
 
13
13
  module Reclassifier
14
14
  autoload :Bayes, 'reclassifier/bayes'
15
- autoload :LSI, 'reclassifier/lsi'
16
15
  autoload :ContentNode, 'reclassifier/content_node'
17
- autoload :WordList, 'reclassifier/word_list'
16
+ autoload :LSI, 'reclassifier/lsi'
18
17
  autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
18
+ autoload :WordHash, 'reclassifier/word_hash'
19
+ autoload :WordList, 'reclassifier/word_list'
19
20
  end
data/spec/bayes_spec.rb CHANGED
@@ -3,7 +3,7 @@ require 'spec_helper'
3
3
  describe Reclassifier::Bayes do
4
4
  describe "classifications" do
5
5
  it "should return the classifications" do
6
- subject = described_class.new(:interesting, :uninteresting)
6
+ subject = described_class.new([:interesting, :uninteresting])
7
7
 
8
8
  subject.classifications.sort.should eq([:interesting, :uninteresting])
9
9
  end
@@ -15,7 +15,7 @@ describe Reclassifier::Bayes do
15
15
  end
16
16
 
17
17
  it "should train the classifier to the (classification, document) pair" do
18
- subject = described_class.new(:in_china, :not_in_china)
18
+ subject = described_class.new([:in_china, :not_in_china])
19
19
 
20
20
  subject.train(:in_china, 'Chinese Beijing Chinese')
21
21
  subject.train(:in_china, 'Chinese Chinese Shanghai')
@@ -32,7 +32,7 @@ describe Reclassifier::Bayes do
32
32
  end
33
33
 
34
34
  it "should untrain the classifier against the (classification, document) pair" do
35
- subject = described_class.new(:in_china, :not_in_china)
35
+ subject = described_class.new([:in_china, :not_in_china])
36
36
 
37
37
  subject.train(:in_china, 'Chinese Chinese')
38
38
  subject.train(:not_in_china, 'Chinese Macao')
@@ -47,7 +47,7 @@ describe Reclassifier::Bayes do
47
47
 
48
48
  describe "calculate_scores" do
49
49
  it "should return a score hash with the correct scores" do
50
- subject = described_class.new(:in_china, :not_in_china)
50
+ subject = described_class.new([:in_china, :not_in_china])
51
51
 
52
52
  subject.train(:in_china, 'Chinese Beijing Chinese')
53
53
  subject.train(:in_china, 'Chinese Chinese Shanghai')
@@ -59,6 +59,14 @@ describe Reclassifier::Bayes do
59
59
  scores[:in_china].should eq(-8.107690312843907)
60
60
  scores[:not_in_china].should eq(-8.906681345001262)
61
61
  end
62
+
63
+ it "should handle the case when no documents are classified for a particular classification" do
64
+ subject = described_class.new([:in_china, :not_in_china])
65
+
66
+ subject.train(:in_china, 'Chinese Beijing Chinese')
67
+
68
+ subject.calculate_scores('Chinese Beijing')
69
+ end
62
70
  end
63
71
 
64
72
  describe "add_classification" do
@@ -94,4 +102,33 @@ describe Reclassifier::Bayes do
94
102
  subject.remove_classification(:niner).should be(nil)
95
103
  end
96
104
  end
105
+
106
+ context ':clean option' do
107
+ it 'should cause punctuation to be omitted if it is set to true' do
108
+ subject = described_class.new([:one, :other], {:clean => true})
109
+
110
+ subject.train(:one, '! ! ! ! bbb')
111
+ subject.train(:other, 'aaa')
112
+
113
+ subject.classify('! aaa !').should eq(:other)
114
+ end
115
+
116
+ it 'should default to true' do
117
+ subject = described_class.new([:one, :other])
118
+
119
+ subject.train(:one, '! ! ! ! bbb')
120
+ subject.train(:other, 'aaa')
121
+
122
+ subject.classify('! aaa !').should eq(:other)
123
+ end
124
+
125
+ it 'should cause punctuation not to be omitted if it is set to false' do
126
+ subject = described_class.new([:one, :other], {:clean => false})
127
+
128
+ subject.train(:one, '! ! ! ! bbb')
129
+ subject.train(:other, 'aaa')
130
+
131
+ subject.classify('! aaa !').should eq(:one)
132
+ end
133
+ end
97
134
  end
data/spec/lsi_spec.rb CHANGED
@@ -45,7 +45,7 @@ describe Reclassifier::LSI do
45
45
  end
46
46
 
47
47
  it "should perform better than Bayes" do
48
- bayes = Reclassifier::Bayes.new :dog, :cat, :bird
48
+ bayes = Reclassifier::Bayes.new([:dog, :cat, :bird])
49
49
 
50
50
  [[@str1, "Dog"],
51
51
  [@str2, "Dog"],
@@ -1,11 +1,11 @@
1
- require 'spec_helper'
1
+ require "spec_helper"
2
2
 
3
- describe String do
3
+ describe Reclassifier::Bayes do
4
4
  describe "word_hash" do
5
5
  it "should hash text" do
6
6
  hash = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
7
7
 
8
- "here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
8
+ subject.word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
9
9
  end
10
10
  end
11
11
 
@@ -13,7 +13,7 @@ describe String do
13
13
  it "should clean and hash text" do
14
14
  hash = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
15
15
 
16
- "here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
16
+ subject.clean_word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
17
17
  end
18
18
  end
19
19
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: reclassifier
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -114,13 +114,14 @@ files:
114
114
  - lib/reclassifier/lsi.rb
115
115
  - lib/reclassifier/unknown_classification_error.rb
116
116
  - lib/reclassifier/version.rb
117
+ - lib/reclassifier/word_hash.rb
117
118
  - lib/reclassifier/word_list.rb
118
119
  - reclassifier.gemspec
119
120
  - spec/bayes_spec.rb
120
121
  - spec/core_ext/array_spec.rb
121
- - spec/core_ext/string_spec.rb
122
122
  - spec/lsi_spec.rb
123
123
  - spec/spec_helper.rb
124
+ - spec/word_hash_spec.rb
124
125
  homepage: https://github.com/saveup/reclassifier
125
126
  licenses:
126
127
  - LGPL
@@ -149,6 +150,6 @@ summary: Bayesian and Latent Semantic Indexing classification of text.
149
150
  test_files:
150
151
  - spec/bayes_spec.rb
151
152
  - spec/core_ext/array_spec.rb
152
- - spec/core_ext/string_spec.rb
153
153
  - spec/lsi_spec.rb
154
154
  - spec/spec_helper.rb
155
+ - spec/word_hash_spec.rb