otherinbox-classifier 1.3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,72 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier
  # Internal data structure for a single node of the LSI index. It stores the
  # node's word hash and categories, plus the raw and LSI vectors (and their
  # normalized forms) that are assigned to it. Save for raw_vector_with, it
  # should be fairly straightforward to understand.
  # You should never have to use it directly.
  class ContentNode
    attr_accessor :raw_vector, :raw_norm,
                  :lsi_vector, :lsi_norm,
                  :categories

    attr_reader :word_hash

    # word_hash::  hash of word => numeric weight for this node
    #              (presumably occurrence counts; confirm against the caller).
    # categories:: zero or more categories to attach to the node.
    def initialize(word_hash, *categories)
      # A splat parameter is always an Array (possibly empty), never nil.
      @categories = categories
      @word_hash = word_hash
    end

    # Use this to fetch the appropriate search vector: the LSI vector when
    # one has been assigned, otherwise the raw vector.
    def search_vector
      @lsi_vector || @raw_vector
    end

    # Use this to fetch the appropriate search vector in normalized form:
    # the LSI norm when one has been assigned, otherwise the raw norm.
    def search_norm
      @lsi_norm || @raw_norm
    end

    # Creates the raw vector out of word_hash using word_list as the
    # key for mapping the vector space, and stores it in raw_vector together
    # with its normalized form in raw_norm.
    #
    # word_list:: object answering +size+ and <tt>[word]</tt> (the index of
    #             the word, or nil when the word is not in the space).
    #
    # Returns the new raw vector.
    def raw_vector_with(word_list)
      vec = $GSL ? GSL::Vector.alloc(word_list.size) : Array.new(word_list.size, 0)

      # Words that are missing from word_list have no dimension and are skipped.
      @word_hash.each do |word, weight|
        index = word_list[word]
        vec[index] = weight if index
      end

      # Force a Float so the proportions below never use integer division
      # (an Integer total would turn every term / total into 0).
      total_words = vec.sum.to_f

      # Perform the scaling transform if this vector has more than one
      # word in it.
      if total_words > 1.0
        # Sum of p * ln(p) over the non-zero terms, where p is the share
        # of the total held by that term.
        weighted_total = 0.0
        vec.each do |term|
          next unless term > 0

          proportion = term / total_words
          weighted_total += proportion * Math.log(proportion)
        end

        # When one term holds the whole total, p is 1 and the sum is exactly
        # zero; dividing by it would yield Infinity/NaN components, so the
        # vector is left unscaled in that case.
        unless weighted_total.zero?
          vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
        end
      end

      if $GSL
        @raw_norm = vec.normalize
        @raw_vector = vec
      else
        @raw_norm = Vector[*vec].normalize
        @raw_vector = Vector[*vec]
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
# Summarization helpers added to String. Both summaries split the receiver
# into chunks, index the chunks with Classifier::LSI and join the chunks the
# index reports as having the highest relative content.
class String
  # Summarizes the string by sentence.
  #
  # count::     passed to LSI#highest_relative_content (default 10).
  # separator:: text placed between the selected sentences.
  def summary(count = 10, separator = " [...] ")
    perform_lsi(split_sentences, count, separator)
  end

  # Summarizes the string by paragraph.
  #
  # count::     passed to LSI#highest_relative_content (default 1).
  # separator:: text placed between the selected paragraphs.
  def paragraph_summary(count = 1, separator = " [...] ")
    perform_lsi(split_paragraphs, count, separator)
  end

  # Splits on ".", "!" and "?". The pattern has a capture group, so the
  # delimiters themselves are also returned as separate elements.
  def split_sentences
    split(/([.!?])/) # TODO: make this less primitive
  end

  # Splits on blank lines (LF, CR or CRLF line endings). The pattern has a
  # capture group, so the delimiters are also returned as separate elements.
  def split_paragraphs
    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
  end

  private

  # Indexes +chunks+ and joins the stripped chunks returned by
  # LSI#highest_relative_content(count) with +separator+.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false
    chunks.each do |chunk|
      # Skip blank chunks and chunks of a single token; this also drops the
      # bare delimiters produced by the capturing split patterns.
      next if chunk.strip.split.size < 2

      lsi << chunk
    end
    lsi.build_index

    # NOTE(review): the previous code filtered this list against itself,
    # which kept every element. If the intent was to emit the selected chunks
    # in document order, filter +chunks+ instead -- confirm before changing.
    summaries = lsi.highest_relative_content(count)
    summaries.map { |chunk| chunk.strip }.join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier
  # This class keeps a word => index mapping. It is used to map stemmed words
  # to dimensions of a vector.
  class WordList
    def initialize
      @location_table = {}
    end

    # Adds a word (if it is new) and assigns it a unique dimension.
    # Returns the newly assigned dimension, or nil when the word was
    # already present.
    def add_word(word)
      # Dimensions are handed out in insertion order: the current table size
      # is always the next unused index.
      @location_table[word] = @location_table.size unless @location_table.key?(word)
    end

    # Returns the dimension of the word or nil if the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Returns the word mapped to dimension +ind+, or nil if no word has it.
    def word_for_index(ind)
      # Hash#key does the reverse lookup without building an inverted copy
      # of the whole table on every call.
      @location_table.key(ind)
    end

    # Returns the number of words mapped.
    def size
      @location_table.size
    end
  end
end
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Exercises Classifier::Bayes: training through train_<category> calls,
# category listing and addition, and classification of a short phrase.
class BayesianTest < Test::Unit::TestCase
  # Every test starts from a classifier with two categories.
  def setup
    @classifier = Classifier::Bayes.new('Interesting', 'Uninteresting')
  end

  # Training a category given at construction must not raise.
  def test_good_training
    assert_nothing_raised do
      @classifier.train_interesting("love")
    end
  end

  # Training a category that was never declared must raise.
  def test_bad_training
    assert_raise(StandardError) do
      @classifier.train_no_category("words")
    end
  end

  # Calls that are not training calls must raise NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) do
      @classifier.forget_everything_you_know("")
    end
  end

  # The categories are the ones given at construction (order is ignored).
  def test_categories
    expected = ['Interesting', 'Uninteresting'].sort
    assert_equal(expected, @classifier.categories.sort)
  end

  # add_category extends the list of categories.
  def test_add_category
    @classifier.add_category('Test')
    expected = ['Test', 'Interesting', 'Uninteresting'].sort
    assert_equal(expected, @classifier.categories.sort)
  end

  # A phrase sharing words with the "uninteresting" sample is classified so.
  def test_classification
    @classifier.train_interesting("here are some good words. I hope you love them")
    @classifier.train_uninteresting("here are some bad words, I hate you")
    verdict = @classifier.classify("I hate bad words and you")
    assert_equal('Uninteresting', verdict)
  end
end
@@ -0,0 +1,14 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Checks the word-hash helpers that the project adds to String.
class StringExtensionsTest < Test::Unit::TestCase
  # word_hash is expected to keep punctuation entries next to the word stems.
  def test_word_hash
    expected = {
      :good => 1, :"!" => 1, :hope => 1, :"'" => 1, :"." => 1,
      :love => 1, :word => 1, :them => 1, :test => 1
    }
    actual = "here are some good words of test's. I hope you love them!".word_hash
    assert_equal(expected, actual)
  end

  # clean_word_hash is expected to contain the word stems only.
  def test_clean_word_hash
    expected = {
      :good => 1, :word => 1, :hope => 1,
      :love => 1, :them => 1, :test => 1
    }
    actual = "here are some good words of test's. I hope you love them!".clean_word_hash
    assert_equal(expected, actual)
  end
end
@@ -0,0 +1,123 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Exercises Classifier::LSI over five short sample documents: indexing,
# rebuild tracking, categorizing, searching, marshalling and summarizing.
class LSITest < Test::Unit::TestCase
  def setup
    # We repeat principal words to help weight them.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = "This text deals with dogs. Dogs."
    @str2 = "This text involves dogs too. Dogs! "
    @str3 = "This text revolves around cats. Cats."
    @str4 = "This text also involves cats. Cats!"
    @str5 = "This text involves birds. Birds."
  end

  def test_basic_indexing
    lsi = uncategorized_index
    assert(!lsi.needs_rebuild?)

    # Note that the closest match to str1 is str2, even though it is not
    # the closest text match.
    assert_equal([@str2, @str5, @str3], lsi.find_related(@str1, 3))
  end

  # With auto_rebuild disabled the index stays stale until build_index runs.
  def test_not_auto_rebuild
    lsi = Classifier::LSI.new(:auto_rebuild => false)
    lsi.add_item(@str1, "Dog")
    lsi.add_item(@str2, "Dog")
    assert(lsi.needs_rebuild?)

    lsi.build_index
    assert(!lsi.needs_rebuild?)
  end

  def test_basic_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item(@str2, "Dog")
    lsi.add_item(@str3, "Cat")
    lsi.add_item(@str4, "Cat")
    lsi.add_item(@str5, "Bird")

    assert_equal("Dog", lsi.classify(@str1))
    assert_equal("Cat", lsi.classify(@str3))
    assert_equal("Bird", lsi.classify(@str5))
  end

  def test_external_classifying
    lsi = categorized_index

    bayes = Classifier::Bayes.new('Dog', 'Cat', 'Bird')
    bayes.train_dog(@str1)
    bayes.train_dog(@str2)
    bayes.train_cat(@str3)
    bayes.train_cat(@str4)
    bayes.train_bird(@str5)

    # We're talking about dogs. Even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats. So bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = "This text revolves around dogs."
    assert_equal("Dog", lsi.classify(tricky_case))
    assert_not_equal("Dog", bayes.classify(tricky_case))
  end

  def test_recategorize_interface
    lsi = categorized_index

    tricky_case = "This text revolves around dogs."
    assert_equal("Dog", lsi.classify(tricky_case))

    # Recategorize as needed.
    lsi.categories_for(@str1).clear.push("Cow")
    lsi.categories_for(@str2).clear.push("Cow")

    assert(!lsi.needs_rebuild?)
    assert_equal("Cow", lsi.classify(tricky_case))
  end

  def test_search
    lsi = uncategorized_index

    # Searching by content and text, note that @str2 comes up first, because
    # both "dog" and "involve" are present. But, the next match is @str1 instead
    # of @str4, because "dog" carries more weight than involves.
    by_content_and_text = lsi.search("dog involves", 100)
    assert_equal([@str2, @str1, @str4, @str5, @str3], by_content_and_text)

    # Keyword search shows how the space is mapped out in relation to
    # dog when magnitude is removed. Note the relations. We move from dog
    # through involve and then finally to other words.
    by_keyword = lsi.search("dog", 5)
    assert_equal([@str1, @str2, @str4, @str5, @str3], by_keyword)
  end

  # An index that went through Marshal must answer queries like the original.
  def test_serialize_safe
    lsi = uncategorized_index

    lsi_md = Marshal.dump(lsi)
    lsi_m = Marshal.load(lsi_md)

    assert_equal(lsi_m.search("cat", 3), lsi.search("cat", 3))
    assert_equal(lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3))
  end

  def test_keyword_search
    lsi = categorized_index

    assert_equal([:dog, :text, :deal], lsi.highest_ranked_stems(@str1))
  end

  def test_summary
    text = [@str1, @str2, @str3, @str4, @str5].join
    assert_equal("This text involves dogs too [...] This text also involves cats",
                 text.summary(2))
  end

  private

  # Returns a default LSI holding the five sample documents, uncategorized.
  def uncategorized_index
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |doc| lsi << doc }
    lsi
  end

  # Returns a default LSI holding the five sample documents with their
  # "Dog" / "Cat" / "Bird" categories.
  def categorized_index
    lsi = Classifier::LSI.new
    lsi.add_item(@str1, "Dog")
    lsi.add_item(@str2, "Dog")
    lsi.add_item(@str3, "Cat")
    lsi.add_item(@str4, "Cat")
    lsi.add_item(@str5, "Bird")
    lsi
  end
end
@@ -0,0 +1,4 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'classifier'
metadata ADDED
@@ -0,0 +1,85 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: otherinbox-classifier
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.3.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Lucas Carlson
8
+ autorequire: classifier
9
+ bindir: bin
10
+ cert_chain:
11
+ date: 2008-01-19 00:00:00 -08:00
12
+ default_executable:
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: stemmer
16
+ type: :runtime
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.0.0
23
+ version:
24
+ description: A general classifier module to allow Bayesian and other types of classifications.
25
+ email: lucas@rufy.com
26
+ executables: []
27
+
28
+ extensions: []
29
+
30
+ extra_rdoc_files: []
31
+
32
+ files:
33
+ - lib/classifier.rb
34
+ - lib/classifier
35
+ - lib/classifier/bayes.rb
36
+ - lib/classifier/lsi.rb
37
+ - lib/classifier/extensions
38
+ - lib/classifier/extensions/string.rb
39
+ - lib/classifier/extensions/vector.rb
40
+ - lib/classifier/extensions/vector_serialize.rb
41
+ - lib/classifier/extensions/word_hash.rb
42
+ - lib/classifier/lsi
43
+ - lib/classifier/lsi/content_node.rb
44
+ - lib/classifier/lsi/summary.rb
45
+ - lib/classifier/lsi/word_list.rb
46
+ - bin/bayes.rb
47
+ - bin/summarize.rb
48
+ - test/bayes
49
+ - test/bayes/bayesian_test.rb
50
+ - test/test_helper.rb
51
+ - test/extensions
52
+ - test/extensions/word_hash_test.rb
53
+ - test/lsi
54
+ - test/lsi/lsi_test.rb
55
+ - README
56
+ - Rakefile
57
+ - LICENSE
58
+ has_rdoc: true
59
+ homepage: http://classifier.rufy.com/
60
+ post_install_message:
61
+ rdoc_options: []
62
+
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - ">"
68
+ - !ruby/object:Gem::Version
69
+ version: 0.0.0
70
+ version:
71
+ required_rubygems_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: "0"
76
+ version:
77
+ requirements:
78
+ - A porter-stemmer module to split word stems.
79
+ rubyforge_project:
80
+ rubygems_version: 1.2.0
81
+ signing_key:
82
+ specification_version: 1
83
+ summary: A general classifier module to allow Bayesian and other types of classifications.
84
+ test_files: []
85
+