reclassifier 0.0.4 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/reclassifier/bayes.rb +26 -10
- data/lib/reclassifier/core_ext/string.rb +0 -116
- data/lib/reclassifier/lsi.rb +3 -2
- data/lib/reclassifier/version.rb +1 -1
- data/lib/reclassifier/word_hash.rb +111 -0
- data/lib/reclassifier.rb +3 -2
- data/spec/bayes_spec.rb +41 -4
- data/spec/lsi_spec.rb +1 -1
- data/spec/{core_ext/string_spec.rb → word_hash_spec.rb} +4 -4
- metadata +4 -3
data/lib/reclassifier/bayes.rb
CHANGED
@@ -6,15 +6,22 @@
|
|
6
6
|
# Cambridge University Press. 2008, ISBN 0521865719.
|
7
7
|
#
|
8
8
|
class Reclassifier::Bayes
|
9
|
+
include Reclassifier::WordHash
|
10
|
+
|
9
11
|
# Can be created with zero or more classifications, each of which will be
|
10
12
|
# initialized and given a training method. The classifications are specified as
|
11
|
-
# symbols.
|
12
|
-
#
|
13
|
-
|
13
|
+
# an array of symbols. Options are specified in a hash.
|
14
|
+
#
|
15
|
+
# Options:
|
16
|
+
# * :clean - If false, punctuation will be included in the classifier. Otherwise, punctuation will be omitted. Default is true.
|
17
|
+
#
|
18
|
+
# b = Reclassifier::Bayes.new([:interesting, :uninteresting, :spam], :clean => true)
|
19
|
+
def initialize(classifications = [], options = {})
|
14
20
|
@classifications = {}
|
15
|
-
classifications.each {|classification| @classifications[classification] = {}}
|
16
|
-
|
17
21
|
@docs_in_classification_count = {}
|
22
|
+
@options = options
|
23
|
+
|
24
|
+
classifications.each {|classification| add_classification(classification)}
|
18
25
|
end
|
19
26
|
|
20
27
|
#
|
@@ -26,10 +33,9 @@ class Reclassifier::Bayes
|
|
26
33
|
def train(classification, text)
|
27
34
|
ensure_classification_exists(classification)
|
28
35
|
|
29
|
-
@docs_in_classification_count[classification] ||= 0
|
30
36
|
@docs_in_classification_count[classification] += 1
|
31
37
|
|
32
|
-
text.
|
38
|
+
smart_word_hash(text).each do |word, count|
|
33
39
|
@classifications[classification][word] ||= 0
|
34
40
|
|
35
41
|
@classifications[classification][word] += count
|
@@ -49,7 +55,7 @@ class Reclassifier::Bayes
|
|
49
55
|
|
50
56
|
@docs_in_classification_count[classification] -= 1
|
51
57
|
|
52
|
-
text.
|
58
|
+
smart_word_hash(text).each do |word, count|
|
53
59
|
@classifications[classification][word] -= count if @classifications[classification].include?(word)
|
54
60
|
end
|
55
61
|
end
|
@@ -68,11 +74,11 @@ class Reclassifier::Bayes
|
|
68
74
|
scores[classification] -= Math.log(@docs_in_classification_count.values.reduce(:+))
|
69
75
|
|
70
76
|
# likelihood
|
71
|
-
text.
|
77
|
+
smart_word_hash(text).each do |word, count|
|
72
78
|
if @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.include?(word)
|
73
79
|
scores[classification] += count * Math.log((classification_word_counts[word] || 0) + 1)
|
74
80
|
|
75
|
-
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+) + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
81
|
+
scores[classification] -= count * Math.log(classification_word_counts.values.reduce(:+).to_i + @classifications.values.reduce(Set.new) {|set, word_counts| set.merge(word_counts.keys)}.count)
|
76
82
|
end
|
77
83
|
end
|
78
84
|
end
|
@@ -107,6 +113,8 @@ class Reclassifier::Bayes
|
|
107
113
|
def add_classification(classification)
|
108
114
|
@classifications[classification] ||= {}
|
109
115
|
|
116
|
+
@docs_in_classification_count[classification] ||= 0
|
117
|
+
|
110
118
|
classification
|
111
119
|
end
|
112
120
|
|
@@ -132,4 +140,12 @@ class Reclassifier::Bayes
|
|
132
140
|
def ensure_classification_exists(classification)
|
133
141
|
raise Reclassifier::UnknownClassificationError unless @classifications.include?(classification)
|
134
142
|
end
|
143
|
+
|
144
|
+
def smart_word_hash(string)
|
145
|
+
if @options[:clean] == false
|
146
|
+
word_hash(string)
|
147
|
+
else
|
148
|
+
clean_word_hash(string)
|
149
|
+
end
|
150
|
+
end
|
135
151
|
end
|
data/lib/reclassifier/core_ext/string.rb
CHANGED
@@ -1,120 +1,4 @@
|
|
1
1
|
class String
|
2
|
-
|
3
|
-
# Removes common punctuation symbols, returning a new string.
|
4
|
-
# E.g.,
|
5
|
-
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
6
|
-
# => "Hello greetings with braces "
|
7
|
-
def without_punctuation
|
8
|
-
tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
|
9
|
-
end
|
10
|
-
|
11
|
-
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
12
|
-
# symbolized, and indexed to its frequency in the document.
|
13
|
-
def word_hash
|
14
|
-
word_hash_for_words(gsub(/[^\w\s]/,"").split + gsub(/[\w]/," ").split)
|
15
|
-
end
|
16
|
-
|
17
|
-
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
18
|
-
def clean_word_hash
|
19
|
-
word_hash_for_words gsub(/[^\w\s]/,"").split
|
20
|
-
end
|
21
|
-
|
22
|
-
def word_hash_for_words(words)
|
23
|
-
d = Hash.new
|
24
|
-
words.each do |word|
|
25
|
-
word.downcase! if word =~ /[\w]+/
|
26
|
-
key = word.stem.to_sym
|
27
|
-
if word =~ /[^\w]/ || ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
28
|
-
d[key] ||= 0
|
29
|
-
d[key] += 1
|
30
|
-
end
|
31
|
-
end
|
32
|
-
return d
|
33
|
-
end
|
34
|
-
|
35
|
-
CORPUS_SKIP_WORDS = [
|
36
|
-
"a",
|
37
|
-
"again",
|
38
|
-
"all",
|
39
|
-
"along",
|
40
|
-
"are",
|
41
|
-
"also",
|
42
|
-
"an",
|
43
|
-
"and",
|
44
|
-
"as",
|
45
|
-
"at",
|
46
|
-
"but",
|
47
|
-
"by",
|
48
|
-
"came",
|
49
|
-
"can",
|
50
|
-
"cant",
|
51
|
-
"couldnt",
|
52
|
-
"did",
|
53
|
-
"didn",
|
54
|
-
"didnt",
|
55
|
-
"do",
|
56
|
-
"doesnt",
|
57
|
-
"dont",
|
58
|
-
"ever",
|
59
|
-
"first",
|
60
|
-
"from",
|
61
|
-
"have",
|
62
|
-
"her",
|
63
|
-
"here",
|
64
|
-
"him",
|
65
|
-
"how",
|
66
|
-
"i",
|
67
|
-
"if",
|
68
|
-
"in",
|
69
|
-
"into",
|
70
|
-
"is",
|
71
|
-
"isnt",
|
72
|
-
"it",
|
73
|
-
"itll",
|
74
|
-
"just",
|
75
|
-
"last",
|
76
|
-
"least",
|
77
|
-
"like",
|
78
|
-
"most",
|
79
|
-
"my",
|
80
|
-
"new",
|
81
|
-
"no",
|
82
|
-
"not",
|
83
|
-
"now",
|
84
|
-
"of",
|
85
|
-
"on",
|
86
|
-
"or",
|
87
|
-
"should",
|
88
|
-
"sinc",
|
89
|
-
"so",
|
90
|
-
"some",
|
91
|
-
"th",
|
92
|
-
"than",
|
93
|
-
"this",
|
94
|
-
"that",
|
95
|
-
"the",
|
96
|
-
"their",
|
97
|
-
"then",
|
98
|
-
"those",
|
99
|
-
"to",
|
100
|
-
"told",
|
101
|
-
"too",
|
102
|
-
"true",
|
103
|
-
"try",
|
104
|
-
"until",
|
105
|
-
"url",
|
106
|
-
"us",
|
107
|
-
"were",
|
108
|
-
"when",
|
109
|
-
"whether",
|
110
|
-
"while",
|
111
|
-
"with",
|
112
|
-
"within",
|
113
|
-
"yes",
|
114
|
-
"you",
|
115
|
-
"youll",
|
116
|
-
]
|
117
|
-
|
118
2
|
def summary( count=10, separator=" [...] " )
|
119
3
|
perform_lsi split_sentences, count, separator
|
120
4
|
end
|
data/lib/reclassifier/lsi.rb
CHANGED
@@ -6,6 +6,7 @@ module Reclassifier
|
|
6
6
|
# data based on underlying semantic relations. For more information on the algorithms used,
|
7
7
|
# please consult Wikipedia[http://en.wikipedia.org/wiki/Latent_Semantic_Indexing].
|
8
8
|
class LSI
|
9
|
+
include Reclassifier::WordHash
|
9
10
|
|
10
11
|
attr_reader :word_list
|
11
12
|
attr_accessor :auto_rebuild
|
@@ -41,7 +42,7 @@ module Reclassifier
|
|
41
42
|
# lsi.add_item ar, *ar.categories { |x| ar.content }
|
42
43
|
#
|
43
44
|
def add_item( item, *categories, &block )
|
44
|
-
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
45
|
+
clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
|
45
46
|
@items[item] = ContentNode.new(clean_word_hash, *categories)
|
46
47
|
@version += 1
|
47
48
|
build_index if @auto_rebuild
|
@@ -276,7 +277,7 @@ module Reclassifier
|
|
276
277
|
if @items[item]
|
277
278
|
return @items[item]
|
278
279
|
else
|
279
|
-
clean_word_hash = block ? block.call(item).clean_word_hash : item.to_s.clean_word_hash
|
280
|
+
clean_word_hash = block ? clean_word_hash(block.call(item)) : clean_word_hash(item.to_s)
|
280
281
|
|
281
282
|
cn = ContentNode.new(clean_word_hash, &block) # make the node and extract the data
|
282
283
|
|
data/lib/reclassifier/version.rb
CHANGED
data/lib/reclassifier/word_hash.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
module Reclassifier::WordHash
|
2
|
+
CORPUS_SKIP_WORDS = ["a",
|
3
|
+
"again",
|
4
|
+
"all",
|
5
|
+
"along",
|
6
|
+
"are",
|
7
|
+
"also",
|
8
|
+
"an",
|
9
|
+
"and",
|
10
|
+
"as",
|
11
|
+
"at",
|
12
|
+
"but",
|
13
|
+
"by",
|
14
|
+
"came",
|
15
|
+
"can",
|
16
|
+
"cant",
|
17
|
+
"couldnt",
|
18
|
+
"did",
|
19
|
+
"didn",
|
20
|
+
"didnt",
|
21
|
+
"do",
|
22
|
+
"doesnt",
|
23
|
+
"dont",
|
24
|
+
"ever",
|
25
|
+
"first",
|
26
|
+
"from",
|
27
|
+
"have",
|
28
|
+
"her",
|
29
|
+
"here",
|
30
|
+
"him",
|
31
|
+
"how",
|
32
|
+
"i",
|
33
|
+
"if",
|
34
|
+
"in",
|
35
|
+
"into",
|
36
|
+
"is",
|
37
|
+
"isnt",
|
38
|
+
"it",
|
39
|
+
"itll",
|
40
|
+
"just",
|
41
|
+
"last",
|
42
|
+
"least",
|
43
|
+
"like",
|
44
|
+
"most",
|
45
|
+
"my",
|
46
|
+
"new",
|
47
|
+
"no",
|
48
|
+
"not",
|
49
|
+
"now",
|
50
|
+
"of",
|
51
|
+
"on",
|
52
|
+
"or",
|
53
|
+
"should",
|
54
|
+
"sinc",
|
55
|
+
"so",
|
56
|
+
"some",
|
57
|
+
"th",
|
58
|
+
"than",
|
59
|
+
"this",
|
60
|
+
"that",
|
61
|
+
"the",
|
62
|
+
"their",
|
63
|
+
"then",
|
64
|
+
"those",
|
65
|
+
"to",
|
66
|
+
"told",
|
67
|
+
"too",
|
68
|
+
"true",
|
69
|
+
"try",
|
70
|
+
"until",
|
71
|
+
"url",
|
72
|
+
"us",
|
73
|
+
"were",
|
74
|
+
"when",
|
75
|
+
"whether",
|
76
|
+
"while",
|
77
|
+
"with",
|
78
|
+
"within",
|
79
|
+
"yes",
|
80
|
+
"you",
|
81
|
+
"youll"]
|
82
|
+
|
83
|
+
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
84
|
+
# symbolized, and indexed to its frequency in the document.
|
85
|
+
def word_hash(string)
|
86
|
+
word_hash_for_words(string.gsub(/[^\w\s]/,"").split + string.gsub(/[\w]/," ").split)
|
87
|
+
end
|
88
|
+
|
89
|
+
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
90
|
+
def clean_word_hash(string)
|
91
|
+
word_hash_for_words(string.gsub(/[^\w\s]/,"").split)
|
92
|
+
end
|
93
|
+
|
94
|
+
def word_hash_for_words(words)
|
95
|
+
d = {}
|
96
|
+
|
97
|
+
words.each do |word|
|
98
|
+
word.downcase!
|
99
|
+
|
100
|
+
key = word.stem.to_sym
|
101
|
+
|
102
|
+
if word =~ /[^\w]/ || !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
103
|
+
d[key] ||= 0
|
104
|
+
d[key] += 1
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
d
|
109
|
+
end
|
110
|
+
|
111
|
+
end
|
data/lib/reclassifier.rb
CHANGED
@@ -12,8 +12,9 @@ require 'gsl/vector'
|
|
12
12
|
|
13
13
|
module Reclassifier
|
14
14
|
autoload :Bayes, 'reclassifier/bayes'
|
15
|
-
autoload :LSI, 'reclassifier/lsi'
|
16
15
|
autoload :ContentNode, 'reclassifier/content_node'
|
17
|
-
autoload :WordList, 'reclassifier/word_list'
|
16
|
+
autoload :LSI, 'reclassifier/lsi'
|
18
17
|
autoload :UnknownClassificationError, 'reclassifier/unknown_classification_error'
|
18
|
+
autoload :WordHash, 'reclassifier/word_hash'
|
19
|
+
autoload :WordList, 'reclassifier/word_list'
|
19
20
|
end
|
data/spec/bayes_spec.rb
CHANGED
@@ -3,7 +3,7 @@ require 'spec_helper'
|
|
3
3
|
describe Reclassifier::Bayes do
|
4
4
|
describe "classifications" do
|
5
5
|
it "should return the classifications" do
|
6
|
-
subject = described_class.new(:interesting, :uninteresting)
|
6
|
+
subject = described_class.new([:interesting, :uninteresting])
|
7
7
|
|
8
8
|
subject.classifications.sort.should eq([:interesting, :uninteresting])
|
9
9
|
end
|
@@ -15,7 +15,7 @@ describe Reclassifier::Bayes do
|
|
15
15
|
end
|
16
16
|
|
17
17
|
it "should train the classifier to the (classification, document) pair" do
|
18
|
-
subject = described_class.new(:in_china, :not_in_china)
|
18
|
+
subject = described_class.new([:in_china, :not_in_china])
|
19
19
|
|
20
20
|
subject.train(:in_china, 'Chinese Beijing Chinese')
|
21
21
|
subject.train(:in_china, 'Chinese Chinese Shanghai')
|
@@ -32,7 +32,7 @@ describe Reclassifier::Bayes do
|
|
32
32
|
end
|
33
33
|
|
34
34
|
it "should untrain the classifier against the (classification, document) pair" do
|
35
|
-
subject = described_class.new(:in_china, :not_in_china)
|
35
|
+
subject = described_class.new([:in_china, :not_in_china])
|
36
36
|
|
37
37
|
subject.train(:in_china, 'Chinese Chinese')
|
38
38
|
subject.train(:not_in_china, 'Chinese Macao')
|
@@ -47,7 +47,7 @@ describe Reclassifier::Bayes do
|
|
47
47
|
|
48
48
|
describe "calculate_scores" do
|
49
49
|
it "should return a score hash with the correct scores" do
|
50
|
-
subject = described_class.new(:in_china, :not_in_china)
|
50
|
+
subject = described_class.new([:in_china, :not_in_china])
|
51
51
|
|
52
52
|
subject.train(:in_china, 'Chinese Beijing Chinese')
|
53
53
|
subject.train(:in_china, 'Chinese Chinese Shanghai')
|
@@ -59,6 +59,14 @@ describe Reclassifier::Bayes do
|
|
59
59
|
scores[:in_china].should eq(-8.107690312843907)
|
60
60
|
scores[:not_in_china].should eq(-8.906681345001262)
|
61
61
|
end
|
62
|
+
|
63
|
+
it "should handle the case when no documents are classified for a particular classification" do
|
64
|
+
subject = described_class.new([:in_china, :not_in_china])
|
65
|
+
|
66
|
+
subject.train(:in_china, 'Chinese Beijing Chinese')
|
67
|
+
|
68
|
+
subject.calculate_scores('Chinese Beijing')
|
69
|
+
end
|
62
70
|
end
|
63
71
|
|
64
72
|
describe "add_classification" do
|
@@ -94,4 +102,33 @@ describe Reclassifier::Bayes do
|
|
94
102
|
subject.remove_classification(:niner).should be(nil)
|
95
103
|
end
|
96
104
|
end
|
105
|
+
|
106
|
+
context ':clean option' do
|
107
|
+
it 'should cause punctuation to be omitted if it is set to true' do
|
108
|
+
subject = described_class.new([:one, :other], {:clean => true})
|
109
|
+
|
110
|
+
subject.train(:one, '! ! ! ! bbb')
|
111
|
+
subject.train(:other, 'aaa')
|
112
|
+
|
113
|
+
subject.classify('! aaa !').should eq(:other)
|
114
|
+
end
|
115
|
+
|
116
|
+
it 'should default to true' do
|
117
|
+
subject = described_class.new([:one, :other])
|
118
|
+
|
119
|
+
subject.train(:one, '! ! ! ! bbb')
|
120
|
+
subject.train(:other, 'aaa')
|
121
|
+
|
122
|
+
subject.classify('! aaa !').should eq(:other)
|
123
|
+
end
|
124
|
+
|
125
|
+
it 'should cause punctuation not to be omitted if it is set to false' do
|
126
|
+
subject = described_class.new([:one, :other], {:clean => false})
|
127
|
+
|
128
|
+
subject.train(:one, '! ! ! ! bbb')
|
129
|
+
subject.train(:other, 'aaa')
|
130
|
+
|
131
|
+
subject.classify('! aaa !').should eq(:one)
|
132
|
+
end
|
133
|
+
end
|
97
134
|
end
|
data/spec/lsi_spec.rb
CHANGED
data/spec/{core_ext/string_spec.rb → word_hash_spec.rb}
CHANGED
@@ -1,11 +1,11 @@
|
|
1
|
-
require
|
1
|
+
require "spec_helper"
|
2
2
|
|
3
|
-
describe String do
|
3
|
+
describe Reclassifier::Bayes do
|
4
4
|
describe "word_hash" do
|
5
5
|
it "should hash text" do
|
6
6
|
hash = {:good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1, :love => 1, :word => 1, :them => 1, :test => 1}
|
7
7
|
|
8
|
-
"here are some good words of test's. I hope you love them!".word_hash.should eq(hash)
|
8
|
+
subject.word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
|
9
9
|
end
|
10
10
|
end
|
11
11
|
|
@@ -13,7 +13,7 @@ describe String do
|
|
13
13
|
it "should clean and hash text" do
|
14
14
|
hash = {:good => 1, :word => 1, :hope => 1, :love => 1, :them => 1, :test => 1}
|
15
15
|
|
16
|
-
"here are some good words of test's. I hope you love them!".clean_word_hash.should eq(hash)
|
16
|
+
subject.clean_word_hash("here are some good words of test's. I hope you love them!").should eq(hash)
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: reclassifier
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.1.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -114,13 +114,14 @@ files:
|
|
114
114
|
- lib/reclassifier/lsi.rb
|
115
115
|
- lib/reclassifier/unknown_classification_error.rb
|
116
116
|
- lib/reclassifier/version.rb
|
117
|
+
- lib/reclassifier/word_hash.rb
|
117
118
|
- lib/reclassifier/word_list.rb
|
118
119
|
- reclassifier.gemspec
|
119
120
|
- spec/bayes_spec.rb
|
120
121
|
- spec/core_ext/array_spec.rb
|
121
|
-
- spec/core_ext/string_spec.rb
|
122
122
|
- spec/lsi_spec.rb
|
123
123
|
- spec/spec_helper.rb
|
124
|
+
- spec/word_hash_spec.rb
|
124
125
|
homepage: https://github.com/saveup/reclassifier
|
125
126
|
licenses:
|
126
127
|
- LGPL
|
@@ -149,6 +150,6 @@ summary: Bayesian and Latent Semantic Indexing classification of text.
|
|
149
150
|
test_files:
|
150
151
|
- spec/bayes_spec.rb
|
151
152
|
- spec/core_ext/array_spec.rb
|
152
|
-
- spec/core_ext/string_spec.rb
|
153
153
|
- spec/lsi_spec.rb
|
154
154
|
- spec/spec_helper.rb
|
155
|
+
- spec/word_hash_spec.rb
|