reclassifier 0.0.4 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/reclassifier/bayes.rb +26 -10
- data/lib/reclassifier/core_ext/string.rb +0 -116
- data/lib/reclassifier/lsi.rb +3 -2
- data/lib/reclassifier/version.rb +1 -1
- data/lib/reclassifier/word_hash.rb +111 -0
- data/lib/reclassifier.rb +3 -2
- data/spec/bayes_spec.rb +41 -4
- data/spec/lsi_spec.rb +1 -1
- data/spec/{core_ext/string_spec.rb → word_hash_spec.rb} +4 -4
- metadata +4 -3
data/lib/reclassifier/bayes.rb
CHANGED
@@ -6,15 +6,22 @@
|
|
6
6
|
# Cambridge University Press. 2008, ISBN 0521865719.
|
7
7
|
#
|
8
8
|
class Reclassifier::Bayes
|
9
|
+
include Reclassifier::WordHash
|
10
|
+
|
9
11
|
# Can be created with zero or more classifications, each of which will be
|
10
12
|
# initialized and given a training method. The classifications are specified as
|
11
|
-
# symbols.
|
12
|
-
#
|
13
|
-
|
13
|
+
# an array of symbols. Options are specified in a hash.
|
14
|
+
#
|
15
|
+
# Options:
|
16
|
+
# * :clean - If false, punctuation will be included in the classifier. Otherwise, punctuation will be omitted. Default is true.
|
17
|
+
#
|
18
|
+
# b = Reclassifier::Bayes.new([:interesting, :uninteresting, :spam], :clean => true)
|
19
|
+
def initialize(classifications = [], options = {})
|
14
20
|
@classifications = {}
|
15
|
-
classifications.each {|classification| @classifications[classification] = {}}
|
16
|
-
|
17
21
|
@docs_in_classification_count = {}
|
22
|
+
@options = options
|
23
|
+
|
24
|
+
classifications.each {|classification| add_classification(classification)}
|
18
25
|
end
|
19
26
|
|
20
27
|
#
|
@@ -26,10 +33,9 @@ class Reclassifier::Bayes
|
|
26
33
|
def train(classification, text)
|
27
34
|
ensure_classification_exists(classification)
|
28
35
|
|
29
|
-
@docs_in_classification_count[classification] ||= 0
|
30
36
|
@docs_in_classification_count[classification] += 1
|
31
37
|
|
32
|
-
text.
|
38
|
+
smart_word_hash(text).each do |word, count|
|
33
39
|
@classifications[classification][word] ||= 0
|
34
40
|
|
35
41
|
@classifications[classification][word] += count
|
@@ -49,7 +55,7 @@ class Reclassifier::Bayes
|
|
49
55
|
|
50
56
|
@docs_in_classification_count[classification] -= 1
|
51
57
|
|
52
|
-
text.
|
58
|
+
smart_word_hash(text).each do |word, count|
|
53
59
|
@classifications[classification][word] -= count if @classifications[classification].include?(word)
|
54
60
|
end
|
55
61
|
end
|
@@ -68,11 +74,11 @@ class Reclassifier::Bayes
|
|
68
74
|
scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
|
69
75
|
|
70
76
|
# likelihood
|
71
|
-
text.
|
77
|
+
smart_word_hash(text).each do |word, count|
|
72
78
|
if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
|
73
79
|
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
74
80
|
|
75
|
-
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
81
|
+
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+).to_i + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
76
82
|
end
|
77
83
|
end
|
78
84
|
end
|
@@ -107,6 +113,8 @@ class Reclassifier::Bayes
|
|
107
113
|
def add_classification(classification)
|
108
114
|
@classifications[classification] ||= {}
|
109
115
|
|
116
|
+
@docs_in_classification_count[classification] ||= 0
|
117
|
+
|
110
118
|
classification
|
111
119
|
end
|
112
120
|
|
@@ -132,4 +140,12 @@ class Reclassifier::Bayes
|
|
132
140
|
def ensure_classification_exists(classification)
|
133
141
|
raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
|
134
142
|
end
|
143
|
+
|
144
|
+
def smart_word_hash(string)
|
145
|
+
if @options[:clean] == false
|
146
|
+
word_hash(string)
|
147
|
+
else
|
148
|
+
clean_word_hash(string)
|
149
|
+
end
|
150
|
+
end
|
135
151
|
end
|
data/lib/reclassifier/core_ext/string.rb
CHANGED
@@ -1,120 +1,4 @@
|
|
1
1
|
class String
|
2
|
-
|
3
|
-
# Removes common punctuation symbols, returning a new string.
|
4
|
-
# E.g.,
|
5
|
-
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
6
|
-
# => "Hello greetings with braces "
|
7
|
-
def without_punctuation
|
8
|
-
tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
9
|
-
end
|
10
|
-
|
11
|
-
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
12
|
-
# symbolized, and indexed to its frequency in the document.
|
13
|
-
def word_hash
|
14
|
-
word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
|
15
|
-
end
|
16
|
-
|
17
|
-
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
18
|
-
def clean_word_hash
|
19
|
-
word_hash_for_words gsub(/[^\w\s]/,"").split
|
20
|
-
end
|
21
|
-
|
22
|
-
def word_hash_for_words(words)
|
23
|
-
d = Hash.new
|
24
|
-
words.each do |word|
|
25
|
-
word.downcase! if word =~ /[\w]+/
|
26
|
-
key = word.stem.to_sym
|
27
|
-
if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
28
|
-
d[key] ||= 0
|
29
|
-
d[key] += 1
|
30
|
-
end
|
31
|
-
end
|
32
|
-
return d
|
33
|
-
end
|
34
|
-
|
35
|
-
CORPUS_SKIP_WORDS = [
|
36
|
-
"a",
|
37
|
-
"again",
|
38
|
-
"all",
|
39
|
-
"along",
|
40
|
-
"are",
|
41
|
-
"also",
|
42
|
-
"an",
|
43
|
-
"and",
|
44
|
-
"as",
|
45
|
-
"at",
|
46
|
-
"but",
|
47
|
-
"by",
|
48
|
-
"came",
|
49
|
-
"can",
|
50
|
-
"cant",
|
51
|
-
"couldnt",
|
52
|
-
"did",
|
53
|
-
"didn",
|
54
|
-
"didnt",
|
55
|
-
"do",
|
56
|
-
"doesnt",
|
57
|
-
"dont",
|
58
|
-
"ever",
|
59
|
-
"first",
|
60
|
-
"from",
|
61
|
-
"have",
|
62
|
-
"her",
|
63
|
-
"here",
|
64
|
-
"him",
|
65
|
-
"how",
|
66
|
-
"i",
|
67
|
-
"if",
|
68
|
-
"in",
|
69
|
-
"into",
|
70
|
-
"is",
|
71
|
-
"isnt",
|
72
|
-
"it",
|
73
|
-
"itll",
|
74
|
-
"just",
|
75
|
-
"last",
|
76
|
-
"least",
|
77
|
-
"like",
|
78
|
-
"most",
|
79
|
-
"my",
|
80
|
-
"new",
|
81
|
-
"no",
|
82
|
-
"not",
|
83
|
-
"now",
|
84
|
-
"of",
|
85
|
-
"on",
|
86
|
-
"or",
|
87
|
-
"should",
|
88
|
-
"sinc",
|
89
|
-
"so",
|
90
|
-
"some",
|
91
|
-
"th",
|
92
|
-
"than",
|
93
|
-
"this",
|
94
|
-
"that",
|
95
|
-
"the",
|
96
|
-
"their",
|
97
|
-
"then",
|
98
|
-
"those",
|
99
|
-
"to",
|
100
|
-
"told",
|
101
|
-
"too",
|
102
|
-
"true",
|
103
|
-
"try",
|
104
|
-
"until",
|
105
|
-
"url",
|
106
|
-
"us",
|
107
|
-
"were",
|
108
|
-
"when",
|
109
|
-
"whether",
|
110
|
-
"while",
|
111
|
-
"with",
|
112
|
-
"within",
|
113
|
-
"yes",
|
114
|
-
"you",
|
115
|
-
"youll",
|
116
|
-
]
|
117
|
-
|
118
2
|
def summary( count=10, separator=" [...] " )
|
119
3
|
perform_lsi split_sentences, count, separator
|
120
4
|
end
|
data/lib/reclassifier/lsi.rb
CHANGED
@@ -6,6 +6,7 @@ module Reclassifier
|
|
6
6
|
# data based on underlying semantic relations. For more information on the algorithms used,
|
7
7
|
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
|
8
8
|
class LSI
|
9
|
+
include Reclassifier::WordHash
|
9
10
|
|
10
11
|
attr_reader :word_list
|
11
12
|
attr_accessor :auto_rebuild
|
@@ -41,7 +42,7 @@ module Reclassifier
|
|
41
42
|
# lsi.add_item ar, *ar.categories { |x| ar.content }
|
42
43
|
#
|
43
44
|
def add_item( item, *categories, &block )
|
44
|
-
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
45
|
+
clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
|
45
46
|
@items[item] = ContentNode.new(clean_word_hash, *categories)
|
46
47
|
@version += 1
|
47
48
|
build_index if @auto_rebuild
|
@@ -276,7 +277,7 @@ module Reclassifier
|
|
276
277
|
if @items[item]
|
277
278
|
return @items[item]
|
278
279
|
else
|
279
|
-
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
280
|
+
clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
|
280
281
|
|
281
282
|
cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
|
282
283
|
|
data/lib/reclassifier/version.rb
CHANGED
data/lib/reclassifier/word_hash.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
module Reclassifier::WordHash
|
2
|
+
CORPUS_SKIP_WORDS = ["a",
|
3
|
+
"again",
|
4
|
+
"all",
|
5
|
+
"along",
|
6
|
+
"are",
|
7
|
+
"also",
|
8
|
+
"an",
|
9
|
+
"and",
|
10
|
+
"as",
|
11
|
+
"at",
|
12
|
+
"but",
|
13
|
+
"by",
|
14
|
+
"came",
|
15
|
+
"can",
|
16
|
+
"cant",
|
17
|
+
"couldnt",
|
18
|
+
"did",
|
19
|
+
"didn",
|
20
|
+
"didnt",
|
21
|
+
"do",
|
22
|
+
"doesnt",
|
23
|
+
"dont",
|
24
|
+
"ever",
|
25
|
+
"first",
|
26
|
+
"from",
|
27
|
+
"have",
|
28
|
+
"her",
|
29
|
+
"here",
|
30
|
+
"him",
|
31
|
+
"how",
|
32
|
+
"i",
|
33
|
+
"if",
|
34
|
+
"in",
|
35
|
+
"into",
|
36
|
+
"is",
|
37
|
+
"isnt",
|
38
|
+
"it",
|
39
|
+
"itll",
|
40
|
+
"just",
|
41
|
+
"last",
|
42
|
+
"least",
|
43
|
+
"like",
|
44
|
+
"most",
|
45
|
+
"my",
|
46
|
+
"new",
|
47
|
+
"no",
|
48
|
+
"not",
|
49
|
+
"now",
|
50
|
+
"of",
|
51
|
+
"on",
|
52
|
+
"or",
|
53
|
+
"should",
|
54
|
+
"sinc",
|
55
|
+
"so",
|
56
|
+
"some",
|
57
|
+
"th",
|
58
|
+
"than",
|
59
|
+
"this",
|
60
|
+
"that",
|
61
|
+
"the",
|
62
|
+
"their",
|
63
|
+
"then",
|
64
|
+
"those",
|
65
|
+
"to",
|
66
|
+
"told",
|
67
|
+
"too",
|
68
|
+
"true",
|
69
|
+
"try",
|
70
|
+
"until",
|
71
|
+
"url",
|
72
|
+
"us",
|
73
|
+
"were",
|
74
|
+
"when",
|
75
|
+
"whether",
|
76
|
+
"while",
|
77
|
+
"with",
|
78
|
+
"within",
|
79
|
+
"yes",
|
80
|
+
"you",
|
81
|
+
"youll"]
|
82
|
+
|
83
|
+
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
84
|
+
# symbolized, and indexed to its frequency in the document.
|
85
|
+
def word_hash(string)
|
86
|
+
word_hash_for_words(string.gsub(/[^\w\s]/,"").split + string.gsub(/[\w]/," ").split)
|
87
|
+
end
|
88
|
+
|
89
|
+
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
90
|
+
def clean_word_hash(string)
|
91
|
+
word_hash_for_words(string.gsub(/[^\w\s]/,"").split)
|
92
|
+
end
|
93
|
+
|
94
|
+
def word_hash_for_words(words)
|
95
|
+
d = {}
|
96
|
+
|
97
|
+
words.each do |word|
|
98
|
+
word.downcase!
|
99
|
+
|
100
|
+
key = word.stem.to_sym
|
101
|
+
|
102
|
+
if word =~ /[^\w]/ || !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
103
|
+
d[key] ||= 0
|
104
|
+
d[key] += 1
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
d
|
109
|
+
end
|
110
|
+
|
111
|
+
end
|
data/lib/reclassifier.rb
CHANGED
@@ -12,8 +12,9 @@ require 'gsl/vector'
|
|
12
12
|
|
13
13
|
module Reclassifier
|
14
14
|
autoload :Bayes, 'reclassifier/bayes'
|
15
|
-
autoload :LSI, 'reclassifier/lsi'
|
16
15
|
autoload :ContentNode, 'reclassifier/content_node'
|
17
|
-
autoload :WordList, 'reclassifier/word_list'
|
16
|
+
autoload :LSI, 'reclassifier/lsi'
|
18
17
|
autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
|
18
|
+
autoload :WordHash, 'reclassifier/word_hash'
|
19
|
+
autoload :WordList, 'reclassifier/word_list'
|
19
20
|
end
|
data/spec/bayes_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require 'spec_helper'
|
|
3
3
|
describe Reclassifier::Bayes do
|
4
4
|
describe "classifications" do
|
5
5
|
it "should return the classifications" do
|
6
|
-
subject = described_class.new(:interesting, :uninteresting)
|
6
|
+
subject = described_class.new([:interesting, :uninteresting])
|
7
7
|
|
8
8
|
subject.classifications.sort.should eq([:interesting, :uninteresting])
|
9
9
|
end
|
@@ -15,7 +15,7 @@ describe Reclassifier::Bayes do
|
|
15
15
|
end
|
16
16
|
|
17
17
|
it "should train the classifier to the (classification, document) pair" do
|
18
|
-
subject = described_class.new(:in_china, :not_in_china)
|
18
|
+
subject = described_class.new([:in_china, :not_in_china])
|
19
19
|
|
20
20
|
subject.train(:in_china, 'Chinese Beijing Chinese')
|
21
21
|
subject.train(:in_china, 'Chinese Chinese Shanghai')
|
@@ -32,7 +32,7 @@ describe Reclassifier::Bayes do
|
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should untrain the classifier against the (classification, document) pair" do
|
35
|
-
subject = described_class.new(:in_china, :not_in_china)
|
35
|
+
subject = described_class.new([:in_china, :not_in_china])
|
36
36
|
|
37
37
|
subject.train(:in_china, 'Chinese Chinese')
|
38
38
|
subject.train(:not_in_china, 'Chinese Macao')
|
@@ -47,7 +47,7 @@ describe Reclassifier::Bayes do
|
|
47
47
|
|
48
48
|
describe "calculate_scores" do
|
49
49
|
it "should return a score hash with the correct scores" do
|
50
|
-
subject = described_class.new(:in_china, :not_in_china)
|
50
|
+
subject = described_class.new([:in_china, :not_in_china])
|
51
51
|
|
52
52
|
subject.train(:in_china, 'Chinese Beijing Chinese')
|
53
53
|
subject.train(:in_china, 'Chinese Chinese Shanghai')
|
@@ -59,6 +59,14 @@ describe Reclassifier::Bayes do
|
|
59
59
|
scores[:in_china].should eq(-8.107690312843907)
|
60
60
|
scores[:not_in_china].should eq(-8.906681345001262)
|
61
61
|
end
|
62
|
+
|
63
|
+
it "should handle the case when no documents are classified for a particular classification" do
|
64
|
+
subject = described_class.new([:in_china, :not_in_china])
|
65
|
+
|
66
|
+
subject.train(:in_china, 'Chinese Beijing Chinese')
|
67
|
+
|
68
|
+
subject.calculate_scores('Chinese Beijing')
|
69
|
+
end
|
62
70
|
end
|
63
71
|
|
64
72
|
describe "add_classification" do
|
@@ -94,4 +102,33 @@ describe Reclassifier::Bayes do
|
|
94
102
|
subject.remove_classification(:niner).should be(nil)
|
95
103
|
end
|
96
104
|
end
|
105
|
+
|
106
|
+
context ':clean option' do
|
107
|
+
it 'should cause punctuation to be omitted if it is set to true' do
|
108
|
+
subject = described_class.new([:one, :other], {:clean => true})
|
109
|
+
|
110
|
+
subject.train(:one, '! ! ! ! bbb')
|
111
|
+
subject.train(:other, 'aaa')
|
112
|
+
|
113
|
+
subject.classify('! aaa !').should eq(:other)
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'should default to true' do
|
117
|
+
subject = described_class.new([:one, :other])
|
118
|
+
|
119
|
+
subject.train(:one, '! ! ! ! bbb')
|
120
|
+
subject.train(:other, 'aaa')
|
121
|
+
|
122
|
+
subject.classify('! aaa !').should eq(:other)
|
123
|
+
end
|
124
|
+
|
125
|
+
it 'should cause punctuation not to be omitted if it is set to false' do
|
126
|
+
subject = described_class.new([:one, :other], {:clean => false})
|
127
|
+
|
128
|
+
subject.train(:one, '! ! ! ! bbb')
|
129
|
+
subject.train(:other, 'aaa')
|
130
|
+
|
131
|
+
subject.classify('! aaa !').should eq(:one)
|
132
|
+
end
|
133
|
+
end
|
97
134
|
end
|
data/spec/lsi_spec.rb
CHANGED
@@ -1,11 +1,11 @@
|
|
1
|
-
require
|
1
|
+
require "spec_helper"
|
2
2
|
|
3
|
-
describe String do
|
3
|
+
describe Reclassifier::Bayes do
|
4
4
|
describe "word_hash" do
|
5
5
|
it "should hash text" do
|
6
6
|
hash = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
|
7
7
|
|
8
|
-
"here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
|
8
|
+
subject.word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
@@ -13,7 +13,7 @@ describe String do
|
|
13
13
|
it "should clean and hash text" do
|
14
14
|
hash = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
|
15
15
|
|
16
|
-
"here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
|
16
|
+
subject.clean_word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reclassifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.1.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -114,13 +114,14 @@ files:
|
|
114
114
|
- lib/reclassifier/lsi.rb
|
115
115
|
- lib/reclassifier/unknown_classification_error.rb
|
116
116
|
- lib/reclassifier/version.rb
|
117
|
+
- lib/reclassifier/word_hash.rb
|
117
118
|
- lib/reclassifier/word_list.rb
|
118
119
|
- reclassifier.gemspec
|
119
120
|
- spec/bayes_spec.rb
|
120
121
|
- spec/core_ext/array_spec.rb
|
121
|
-
- spec/core_ext/string_spec.rb
|
122
122
|
- spec/lsi_spec.rb
|
123
123
|
- spec/spec_helper.rb
|
124
|
+
- spec/word_hash_spec.rb
|
124
125
|
homepage: https://github.com/saveup/reclassifier
|
125
126
|
licenses:
|
126
127
|
- LGPL
|
@@ -149,6 +150,6 @@ summary: Bayesian and Latent Semantic Indexing classification of text.
|
|
149
150
|
test_files:
|
150
151
|
- spec/bayes_spec.rb
|
151
152
|
- spec/core_ext/array_spec.rb
|
152
|
-
- spec/core_ext/string_spec.rb
|
153
153
|
- spec/lsi_spec.rb
|
154
154
|
- spec/spec_helper.rb
|
155
|
+
- spec/word_hash_spec.rb
|