otherinbox-classifier 1.3.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +429 -0
- data/README +88 -0
- data/Rakefile +96 -0
- data/bin/bayes.rb +36 -0
- data/bin/summarize.rb +16 -0
- data/lib/classifier.rb +30 -0
- data/lib/classifier/bayes.rb +172 -0
- data/lib/classifier/extensions/string.rb +16 -0
- data/lib/classifier/extensions/vector.rb +106 -0
- data/lib/classifier/extensions/vector_serialize.rb +20 -0
- data/lib/classifier/extensions/word_hash.rb +154 -0
- data/lib/classifier/lsi.rb +318 -0
- data/lib/classifier/lsi/content_node.rb +72 -0
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/lsi/word_list.rb +36 -0
- data/test/bayes/bayesian_test.rb +33 -0
- data/test/extensions/word_hash_test.rb +14 -0
- data/test/lsi/lsi_test.rb +123 -0
- data/test/test_helper.rb +4 -0
- metadata +85 -0
@@ -0,0 +1,72 @@
|
|
1
|
+
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
|
+
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier
|
6
|
+
|
7
|
+
# This is an internal data structure class for the LSI node. Save for
|
8
|
+
# raw_vector_with, it should be fairly straightforward to understand.
|
9
|
+
# You should never have to use it directly.
|
10
|
+
class ContentNode
  attr_accessor :raw_vector, :raw_norm,
                :lsi_vector, :lsi_norm,
                :categories

  attr_reader :word_hash

  # Stores the word hash and the categories attached to this node.
  #
  # word_hash::  hash whose keys are looked up in a word list and whose
  #              values are numeric weights (they are summed and log-scaled
  #              by raw_vector_with).
  # categories:: zero or more category labels, collected into an array.
  def initialize( word_hash, *categories )
    # A splat parameter is always an Array, so no nil fallback is needed.
    @categories = categories
    @word_hash = word_hash
  end

  # Use this to fetch the appropriate search vector: the LSI vector when
  # one has been assigned, otherwise the raw vector.
  def search_vector
    @lsi_vector || @raw_vector
  end

  # Use this to fetch the appropriate search vector in normalized form:
  # the LSI norm when one has been assigned, otherwise the raw norm.
  def search_norm
    @lsi_norm || @raw_norm
  end

  # Creates the raw vector out of word_hash using word_list as the
  # key for mapping the vector space.
  #
  # word_list:: any object answering +size+ and <tt>[word]</tt>, where the
  #             latter yields the word's dimension or nil when the word is
  #             not part of the space (such words are skipped).
  #
  # Sets raw_vector and raw_norm, and returns the new raw_vector.
  def raw_vector_with( word_list )
    vec = $GSL ? GSL::Vector.alloc(word_list.size) : Array.new(word_list.size, 0)

    @word_hash.each_key do |word|
      index = word_list[word]
      vec[index] = @word_hash[word] if index
    end

    # Perform the scaling transform. Force a Float so the ratios below are
    # never computed with truncating integer division.
    total_words = vec.sum.to_f

    # Perform first-order association transform if this vector has more
    # than one word in it.
    if total_words > 1.0
      weighted_total = 0.0
      vec.each do |term|
        next unless term > 0
        ratio = term / total_words
        weighted_total += ratio * Math.log( ratio )
      end

      # When a single term carries all of the weight, ratio is 1 and
      # weighted_total is exactly zero; dividing by it would fill the
      # vector with Infinity/NaN, so the counts are left unscaled instead.
      unless weighted_total.zero?
        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
      end
    end

    # Without GSL the working Array is wrapped in a Vector.
    vec = Vector[*vec] unless $GSL
    @raw_norm = vec.normalize
    @raw_vector = vec
  end
end
|
71
|
+
|
72
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# Author:: Lucas Carlson (mailto:lucas@rufy.com)
|
2
|
+
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
class String
  # Summarizes the receiver sentence by sentence, joining the selected
  # pieces with +separator+. +count+ is handed to the LSI index as the
  # number of pieces requested.
  def summary( count=10, separator=" [...] " )
    perform_lsi(split_sentences, count, separator)
  end

  # Same as #summary, but works on paragraphs instead of sentences.
  def paragraph_summary( count=1, separator=" [...] " )
    perform_lsi(split_paragraphs, count, separator)
  end

  # Splits on ".", "!" and "?". The pattern has a capture group, so the
  # punctuation marks themselves appear as separate elements of the result.
  def split_sentences
    split(/(\.|\!|\?)/) # TODO: make this less primitive
  end

  # Splits on blank-line separators. The pattern has a capture group, so
  # the separators themselves appear as separate elements of the result.
  def split_paragraphs
    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
  end

  private

  # Indexes every chunk that has more than one whitespace-separated word,
  # asks the index for its +count+ highest-ranked chunks and returns them
  # stripped and joined with +separator+.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each do |chunk|
      stripped = chunk.strip
      next if stripped.empty? || stripped.split.size == 1
      lsi << chunk
    end
    lsi.build_index
    summaries = lsi.highest_relative_content count
    # NOTE(review): this filter checks summaries against itself, so it keeps
    # every element; it may have been meant to filter +chunks+ -- confirm.
    kept = summaries.select { |chunk| summaries.include? chunk }
    kept.map { |x| x.strip }.join(separator)
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
# Author:: David Fayram (mailto:dfayram@lensmen.net)
|
2
|
+
# Copyright:: Copyright (c) 2005 David Fayram II
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module Classifier
|
6
|
+
# This class keeps a word => index mapping. It is used to map stemmed words
|
7
|
+
# to dimensions of a vector.
|
8
|
+
|
9
|
+
class WordList
  # Starts with an empty word => dimension table.
  def initialize
    @location_table = {}
  end

  # Adds a word (if it is new) and assigns it a unique dimension.
  # Returns the assigned dimension, or nil when the word was already known.
  def add_word(word)
    # Dimensions are Integers, so even dimension 0 counts as "present" here.
    @location_table[word] = @location_table.size unless @location_table[word]
  end

  # Returns the dimension of the word or nil if the word is not in the space.
  def [](lookup)
    @location_table[lookup]
  end

  # Returns the word mapped to dimension +ind+, or nil if there is none.
  def word_for_index(ind)
    # Hash#key scans for the value directly instead of allocating a full
    # inverted copy of the table on every lookup.
    @location_table.key(ind)
  end

  # Returns the number of words mapped.
  def size
    @location_table.size
  end
end
|
36
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
+
class BayesianTest < Test::Unit::TestCase
  # Every test starts from a fresh classifier with two categories.
  def setup
    @classifier = Classifier::Bayes.new('Interesting', 'Uninteresting')
  end

  # Training an existing category must not raise.
  def test_good_training
    assert_nothing_raised do
      @classifier.train_interesting("love")
    end
  end

  # Training a category that was never declared must raise.
  def test_bad_training
    assert_raise(StandardError) do
      @classifier.train_no_category("words")
    end
  end

  # Calling a method the classifier does not provide must raise NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) do
      @classifier.forget_everything_you_know("")
    end
  end

  # The classifier reports the categories it was created with.
  def test_categories
    expected = ['Interesting', 'Uninteresting'].sort
    assert_equal(expected, @classifier.categories.sort)
  end

  # A category added later shows up next to the initial ones.
  def test_add_category
    @classifier.add_category('Test')
    expected = ['Test', 'Interesting', 'Uninteresting'].sort
    assert_equal(expected, @classifier.categories.sort)
  end

  # After training both categories, a text sharing words with the
  # "uninteresting" sample is classified as such.
  def test_classification
    @classifier.train_interesting("here are some good words. I hope you love them")
    @classifier.train_uninteresting("here are some bad words, I hate you")
    verdict = @classifier.classify("I hate bad words and you")
    assert_equal('Uninteresting', verdict)
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
+
class StringExtensionsTest < Test::Unit::TestCase
  # word_hash is expected to include punctuation marks as keys.
  def test_word_hash
    expected = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
    actual = "here are some good words of test's. I hope you love them!".word_hash
    assert_equal(expected, actual)
  end

  # clean_word_hash is expected to contain no punctuation keys.
  def test_clean_word_hash
    expected = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
    actual = "here are some good words of test's. I hope you love them!".clean_word_hash
    assert_equal(expected, actual)
  end
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
+
class LSITest < Test::Unit::TestCase
  def setup
    # The principal words are repeated to help weight them.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = "This text deals with dogs. Dogs."
    @str2 = "This text involves dogs too. Dogs! "
    @str3 = "This text revolves around cats. Cats."
    @str4 = "This text also involves cats. Cats!"
    @str5 = "This text involves birds. Birds."
  end

  def test_basic_indexing
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each do |text|
      lsi << text
    end
    assert(!lsi.needs_rebuild?)

    # The closest match to str1 is str2, even though it is not the
    # closest text match.
    related = lsi.find_related(@str1, 3)
    assert_equal([@str2, @str5, @str3], related)
  end

  def test_not_auto_rebuild
    lsi = Classifier::LSI.new(:auto_rebuild => false)
    [@str1, @str2].each do |text|
      lsi.add_item(text, "Dog")
    end
    assert(lsi.needs_rebuild?)
    lsi.build_index
    assert(!lsi.needs_rebuild?)
  end

  def test_basic_categorizing
    lsi = Classifier::LSI.new
    [[@str2, "Dog"], [@str3, "Cat"], [@str4, "Cat"], [@str5, "Bird"]].each do |text, category|
      lsi.add_item(text, category)
    end

    assert_equal("Dog", lsi.classify(@str1))
    assert_equal("Cat", lsi.classify(@str3))
    assert_equal("Bird", lsi.classify(@str5))
  end

  def test_external_classifying
    lsi = Classifier::LSI.new
    bayes = Classifier::Bayes.new('Dog', 'Cat', 'Bird')

    lsi.add_item(@str1, "Dog")
    bayes.train_dog(@str1)
    lsi.add_item(@str2, "Dog")
    bayes.train_dog(@str2)
    lsi.add_item(@str3, "Cat")
    bayes.train_cat(@str3)
    lsi.add_item(@str4, "Cat")
    bayes.train_cat(@str4)
    lsi.add_item(@str5, "Bird")
    bayes.train_bird(@str5)

    # We're talking about dogs, even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats, so bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = "This text revolves around dogs."
    assert_equal("Dog", lsi.classify(tricky_case))
    assert_not_equal("Dog", bayes.classify(tricky_case))
  end

  def test_recategorize_interface
    lsi = Classifier::LSI.new
    [[@str1, "Dog"], [@str2, "Dog"], [@str3, "Cat"], [@str4, "Cat"], [@str5, "Bird"]].each do |text, category|
      lsi.add_item(text, category)
    end

    tricky_case = "This text revolves around dogs."
    assert_equal("Dog", lsi.classify(tricky_case))

    # Recategorize as needed.
    [@str1, @str2].each do |text|
      lsi.categories_for(text).clear.push("Cow")
    end

    assert(!lsi.needs_rebuild?)
    assert_equal("Cow", lsi.classify(tricky_case))
  end

  def test_search
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each do |text|
      lsi << text
    end

    # Searching by content and text: @str2 comes up first, because both
    # "dog" and "involve" are present. The next match is @str1 instead of
    # @str4, because "dog" carries more weight than "involves".
    by_content = lsi.search("dog involves", 100)
    assert_equal([@str2, @str1, @str4, @str5, @str3], by_content)

    # Keyword search shows how the space is mapped out in relation to
    # dog when magnitude is removed. Note the relations: we move from dog
    # through involve and then finally to other words.
    by_keyword = lsi.search("dog", 5)
    assert_equal([@str1, @str2, @str4, @str5, @str3], by_keyword)
  end

  def test_serialize_safe
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each do |text|
      lsi << text
    end

    # Round-trip the index through Marshal; the copy must answer the same
    # queries in the same way.
    lsi_md = Marshal.dump(lsi)
    lsi_m = Marshal.load(lsi_md)

    assert_equal(lsi_m.search("cat", 3), lsi.search("cat", 3))
    assert_equal(lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3))
  end

  def test_keyword_search
    lsi = Classifier::LSI.new
    [[@str1, "Dog"], [@str2, "Dog"], [@str3, "Cat"], [@str4, "Cat"], [@str5, "Bird"]].each do |text, category|
      lsi.add_item(text, category)
    end

    assert_equal([:dog, :text, :deal], lsi.highest_ranked_stems(@str1))
  end

  def test_summary
    whole_text = [@str1, @str2, @str3, @str4, @str5].join
    assert_equal("This text involves dogs too [...] This text also involves cats", whole_text.summary(2))
  end
end
|
data/test/test_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: otherinbox-classifier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.3.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Lucas Carlson
|
8
|
+
autorequire: classifier
|
9
|
+
bindir: bin
|
10
|
+
cert_chain:
|
11
|
+
date: 2008-01-19 00:00:00 -08:00
|
12
|
+
default_executable:
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: stemmer
|
16
|
+
type: :runtime
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.0.0
|
23
|
+
version:
|
24
|
+
description: A general classifier module to allow Bayesian and other types of classifications.
|
25
|
+
email: lucas@rufy.com
|
26
|
+
executables: []
|
27
|
+
|
28
|
+
extensions: []
|
29
|
+
|
30
|
+
extra_rdoc_files: []
|
31
|
+
|
32
|
+
files:
|
33
|
+
- lib/classifier.rb
|
34
|
+
- lib/classifier
|
35
|
+
- lib/classifier/bayes.rb
|
36
|
+
- lib/classifier/lsi.rb
|
37
|
+
- lib/classifier/extensions
|
38
|
+
- lib/classifier/extensions/string.rb
|
39
|
+
- lib/classifier/extensions/vector.rb
|
40
|
+
- lib/classifier/extensions/vector_serialize.rb
|
41
|
+
- lib/classifier/extensions/word_hash.rb
|
42
|
+
- lib/classifier/lsi
|
43
|
+
- lib/classifier/lsi/content_node.rb
|
44
|
+
- lib/classifier/lsi/summary.rb
|
45
|
+
- lib/classifier/lsi/word_list.rb
|
46
|
+
- bin/bayes.rb
|
47
|
+
- bin/summarize.rb
|
48
|
+
- test/bayes
|
49
|
+
- test/bayes/bayesian_test.rb
|
50
|
+
- test/test_helper.rb
|
51
|
+
- test/extensions
|
52
|
+
- test/extensions/word_hash_test.rb
|
53
|
+
- test/lsi
|
54
|
+
- test/lsi/lsi_test.rb
|
55
|
+
- README
|
56
|
+
- Rakefile
|
57
|
+
- LICENSE
|
58
|
+
has_rdoc: true
|
59
|
+
homepage: http://classifier.rufy.com/
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - ">"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.0.0
|
70
|
+
version:
|
71
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: "0"
|
76
|
+
version:
|
77
|
+
requirements:
|
78
|
+
- A porter-stemmer module to split word stems.
|
79
|
+
rubyforge_project:
|
80
|
+
rubygems_version: 1.2.0
|
81
|
+
signing_key:
|
82
|
+
specification_version: 1
|
83
|
+
summary: A general classifier module to allow Bayesian and other types of classifications.
|
84
|
+
test_files: []
|
85
|
+
|