otherinbox-classifier 1.3.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,72 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier

  # Internal data structure for a single LSI document node. It keeps the
  # document's word counts and categories together with the raw and LSI
  # vectors (plus their normalized forms) assigned to it. Save for
  # raw_vector_with, it should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    attr_reader :word_hash

    # Creates a node from +word_hash+ (word => occurrence count) and any
    # number of trailing +categories+.
    def initialize(word_hash, *categories)
      @categories = categories # a splat is always an Array, never nil
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector: the LSI vector when
    # one has been assigned, otherwise the raw vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space.
    #
    # +word_list+ must respond to +size+ and to <tt>[word]</tt>, returning
    # the dimension index of a word or nil when the word is not mapped.
    # Words of word_hash that are not mapped are ignored.
    #
    # Stores the result in raw_vector and its normalized form in raw_norm,
    # and returns the raw vector.
    def raw_vector_with(word_list)
      vec = $GSL ? GSL::Vector.alloc(word_list.size) : Array.new(word_list.size, 0)

      @word_hash.each_key do |word|
        index = word_list[word]
        vec[index] = @word_hash[word] if index
      end

      # Perform the scaling transform. The total is forced to a Float so the
      # ratios below never fall back to integer division when +sum+ returns
      # an Integer.
      total_words = vec.sum.to_f

      # Perform first-order association transform if this vector has more
      # than one word in it.
      if total_words > 1.0
        weighted_total = 0.0
        vec.each do |term|
          next unless term > 0

          ratio = term / total_words
          weighted_total += ratio * Math.log(ratio)
        end

        # weighted_total is exactly zero when a single dimension holds every
        # counted word (ratio == 1, so log(ratio) == 0). Dividing by it would
        # fill the vector with infinities, so the counts are left unscaled.
        unless weighted_total.zero?
          vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
        end
      end

      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
# Summarization helpers added to String, built on Classifier::LSI.
class String
  # Returns a summary built from at most +count+ sentences of this string,
  # joined with +separator+.
  def summary(count = 10, separator = " [...] ")
    perform_lsi(split_sentences, count, separator)
  end

  # Same as #summary, but works on paragraphs instead of sentences.
  def paragraph_summary(count = 1, separator = " [...] ")
    perform_lsi(split_paragraphs, count, separator)
  end

  # Splits on ".", "!" and "?". Because the pattern is a capture group the
  # delimiters themselves are also returned as separate elements.
  def split_sentences
    split(/(\.|\!|\?)/) # TODO: make this less primitive
  end

  # Splits on blank lines ("\n\n", "\r\r" or "\r\n\r\n"). The captured
  # delimiters are returned as separate elements as well.
  def split_paragraphs
    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
  end

  private

  # Indexes every chunk that contains more than one word in a fresh LSI,
  # asks it for the +count+ highest ranked chunks and joins them (stripped)
  # with +separator+.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each do |chunk|
      stripped = chunk.strip
      # Skip blank chunks and one-word chunks (which includes the bare
      # delimiters produced by the capturing split).
      next if stripped.empty? || stripped.split.size == 1

      lsi << chunk
    end
    lsi.build_index
    summaries = lsi.highest_relative_content count
    # NOTE(review): the original filtered +summaries+ against itself, which
    # never removed anything. If the intent was to keep document order by
    # filtering +chunks+ instead, confirm before changing the output order.
    summaries.map { |x| x.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier
  # This class keeps a word => index mapping. It is used to map stemmed words
  # to dimensions of a vector.

  class WordList
    def initialize
      @location_table = {}
    end

    # Adds a word (if it is new) and assigns it a unique dimension.
    # Returns the newly assigned dimension, or nil when the word was
    # already known.
    def add_word(word)
      @location_table[word] = @location_table.size unless @location_table.key?(word)
    end

    # Returns the dimension of the word or nil if the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Returns the word mapped to dimension +ind+, or nil if there is none.
    # Uses Hash#key so no inverted copy of the table is built per call.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Returns the number of words mapped.
    def size
      @location_table.size
    end

  end
end
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Exercises the public interface of Classifier::Bayes.
class BayesianTest < Test::Unit::TestCase
  # Every test starts from a classifier that knows two categories.
  def setup
    @classifier = Classifier::Bayes.new('Interesting', 'Uninteresting')
  end

  # Training a known category must not raise.
  def test_good_training
    assert_nothing_raised do
      @classifier.train_interesting("love")
    end
  end

  # Training a category that was never declared must raise.
  def test_bad_training
    assert_raise(StandardError) do
      @classifier.train_no_category("words")
    end
  end

  # Calling a method the classifier does not provide must raise NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) do
      @classifier.forget_everything_you_know("")
    end
  end

  def test_categories
    expected = ['Interesting', 'Uninteresting'].sort
    assert_equal(expected, @classifier.categories.sort)
  end

  def test_add_category
    @classifier.add_category('Test')
    expected = ['Test', 'Interesting', 'Uninteresting'].sort
    assert_equal(expected, @classifier.categories.sort)
  end

  def test_classification
    @classifier.train_interesting("here are some good words. I hope you love them")
    @classifier.train_uninteresting("here are some bad words, I hate you")
    verdict = @classifier.classify("I hate bad words and you")
    assert_equal('Uninteresting', verdict)
  end
end
@@ -0,0 +1,14 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Checks the word-hash helpers that the library adds to String.
class StringExtensionsTest < Test::Unit::TestCase
  # word_hash is expected to keep punctuation symbols as entries.
  def test_word_hash
    sentence = "here are some good words of test's. I hope you love them!"
    expected = {
      :good => 1, :"!" => 1, :hope => 1,
      :"'" => 1, :"." => 1, :love => 1,
      :word => 1, :them => 1, :test => 1
    }
    assert_equal(expected, sentence.word_hash)
  end

  # clean_word_hash is expected to drop the punctuation entries.
  def test_clean_word_hash
    sentence = "here are some good words of test's. I hope you love them!"
    expected = {
      :good => 1, :word => 1, :hope => 1,
      :love => 1, :them => 1, :test => 1
    }
    assert_equal(expected, sentence.clean_word_hash)
  end
end
@@ -0,0 +1,123 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Exercises indexing, classification, search, serialization and
# summarization through Classifier::LSI.
class LSITest < Test::Unit::TestCase
  def setup
    # The principal words are repeated to give them more weight.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = "This text deals with dogs. Dogs."
    @str2 = "This text involves dogs too. Dogs! "
    @str3 = "This text revolves around cats. Cats."
    @str4 = "This text also involves cats. Cats!"
    @str5 = "This text involves birds. Birds."
  end

  def test_basic_indexing
    index = indexed_corpus
    assert !index.needs_rebuild?

    # The closest match to str1 is str2, even though it is not the closest
    # text match.
    related = index.find_related(@str1, 3)
    assert_equal [@str2, @str5, @str3], related
  end

  def test_not_auto_rebuild
    index = Classifier::LSI.new :auto_rebuild => false
    [@str1, @str2].each { |text| index.add_item text, "Dog" }
    assert index.needs_rebuild?

    index.build_index
    assert !index.needs_rebuild?
  end

  def test_basic_categorizing
    index = Classifier::LSI.new
    index.add_item @str2, "Dog"
    index.add_item @str3, "Cat"
    index.add_item @str4, "Cat"
    index.add_item @str5, "Bird"

    assert_equal "Dog", index.classify(@str1)
    assert_equal "Cat", index.classify(@str3)
    assert_equal "Bird", index.classify(@str5)
  end

  def test_external_classifying
    index = Classifier::LSI.new
    bayes = Classifier::Bayes.new 'Dog', 'Cat', 'Bird'

    index.add_item @str1, "Dog"
    bayes.train_dog @str1
    index.add_item @str2, "Dog"
    bayes.train_dog @str2
    index.add_item @str3, "Cat"
    bayes.train_cat @str3
    index.add_item @str4, "Cat"
    bayes.train_cat @str4
    index.add_item @str5, "Bird"
    bayes.train_bird @str5

    # We're talking about dogs, even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats, so bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", index.classify(tricky_case)
    assert_not_equal "Dog", bayes.classify(tricky_case)
  end

  def test_recategorize_interface
    index = categorized_index

    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", index.classify(tricky_case)

    # Recategorize as needed.
    index.categories_for(@str1).clear.push "Cow"
    index.categories_for(@str2).clear.push "Cow"

    assert !index.needs_rebuild?
    assert_equal "Cow", index.classify(tricky_case)
  end

  def test_search
    index = indexed_corpus

    # Searching by content and text: @str2 comes up first because both "dog"
    # and "involve" are present. The next match is @str1 instead of @str4,
    # because "dog" carries more weight than "involves".
    by_content = index.search("dog involves", 100)
    assert_equal [@str2, @str1, @str4, @str5, @str3], by_content

    # Keyword search shows how the space is mapped out in relation to
    # dog when magnitude is removed. Note the relations: we move from dog
    # through involve and then finally to other words.
    by_keyword = index.search("dog", 5)
    assert_equal [@str1, @str2, @str4, @str5, @str3], by_keyword
  end

  def test_serialize_safe
    index = indexed_corpus

    dumped = Marshal.dump index
    restored = Marshal.load dumped

    assert_equal restored.search("cat", 3), index.search("cat", 3)
    assert_equal restored.find_related(@str1, 3), index.find_related(@str1, 3)
  end

  def test_keyword_search
    index = categorized_index

    assert_equal [:dog, :text, :deal], index.highest_ranked_stems(@str1)
  end

  def test_summary
    expected = "This text involves dogs too [...] This text also involves cats"
    assert_equal expected, corpus.join.summary(2)
  end

  private

  # The five fixture strings, in declaration order.
  def corpus
    [@str1, @str2, @str3, @str4, @str5]
  end

  # A default LSI with every fixture string appended via <<.
  def indexed_corpus
    index = Classifier::LSI.new
    corpus.each { |text| index << text }
    index
  end

  # A default LSI with every fixture string added under its category.
  def categorized_index
    index = Classifier::LSI.new
    index.add_item @str1, "Dog"
    index.add_item @str2, "Dog"
    index.add_item @str3, "Cat"
    index.add_item @str4, "Cat"
    index.add_item @str5, "Bird"
    index
  end
end
@@ -0,0 +1,4 @@
1
# Put the gem's lib directory at the front of the load path so the tests
# load the library from this checkout.
lib_dir = File.dirname(__FILE__) + '/../lib'
$LOAD_PATH.unshift(lib_dir)

require 'test/unit'
require 'classifier'
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: otherinbox-classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Lucas Carlson
8
+ autorequire: classifier
9
+ bindir: bin
10
+ cert_chain:
11
+ date: 2008-01-19 00:00:00 -08:00
12
+ default_executable:
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: stemmer
16
+ type: :runtime
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.0.0
23
+ version:
24
+ description: A general classifier module to allow Bayesian and other types of classifications.
25
+ email: lucas@rufy.com
26
+ executables: []
27
+
28
+ extensions: []
29
+
30
+ extra_rdoc_files: []
31
+
32
+ files:
33
+ - lib/classifier.rb
34
+ - lib/classifier
35
+ - lib/classifier/bayes.rb
36
+ - lib/classifier/lsi.rb
37
+ - lib/classifier/extensions
38
+ - lib/classifier/extensions/string.rb
39
+ - lib/classifier/extensions/vector.rb
40
+ - lib/classifier/extensions/vector_serialize.rb
41
+ - lib/classifier/extensions/word_hash.rb
42
+ - lib/classifier/lsi
43
+ - lib/classifier/lsi/content_node.rb
44
+ - lib/classifier/lsi/summary.rb
45
+ - lib/classifier/lsi/word_list.rb
46
+ - bin/bayes.rb
47
+ - bin/summarize.rb
48
+ - test/bayes
49
+ - test/bayes/bayesian_test.rb
50
+ - test/test_helper.rb
51
+ - test/extensions
52
+ - test/extensions/word_hash_test.rb
53
+ - test/lsi
54
+ - test/lsi/lsi_test.rb
55
+ - README
56
+ - Rakefile
57
+ - LICENSE
58
+ has_rdoc: true
59
+ homepage: http://classifier.rufy.com/
60
+ post_install_message:
61
+ rdoc_options: []
62
+
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">"
68
+ - !ruby/object:Gem::Version
69
+ version: 0.0.0
70
+ version:
71
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: "0"
76
+ version:
77
+ requirements:
78
+ - A porter-stemmer module to split word stems.
79
+ rubyforge_project:
80
+ rubygems_version: 1.2.0
81
+ signing_key:
82
+ specification_version: 1
83
+ summary: A general classifier module to allow Bayesian and other types of classifications.
84
+ test_files: []
85
+