secobarbital-classifier 1.3.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
# Summarization helpers added to String. They split the receiver into chunks
# (sentences or paragraphs), feed the multi-word chunks to a Classifier::LSI
# index and join whatever that index reports as its highest relative content.
class String
  # Summarizes the string sentence by sentence.
  #
  # count::     value passed to Classifier::LSI#highest_relative_content
  #             (presumably the number of sentences to keep -- confirm in LSI)
  # separator:: text inserted between the selected sentences
  #
  # Returns a String.
  def summary(count = 10, separator = " [...] ")
    perform_lsi(split_sentences, count, separator)
  end

  # Summarizes the string paragraph by paragraph. Takes the same arguments as
  # #summary, but only one paragraph is requested by default.
  #
  # Returns a String.
  def paragraph_summary(count = 1, separator = " [...] ")
    perform_lsi(split_paragraphs, count, separator)
  end

  # Splits the string on ".", "!" and "?".
  #
  # The pattern contains a capture group, so String#split also returns each
  # punctuation mark as its own element, e.g.
  # "A b. C d!" => ["A b", ".", " C d", "!"]. That shape is kept for backward
  # compatibility.
  def split_sentences
    split(/(\.|\!|\?)/) # TODO: make this less primitive
  end

  # Splits the string on blank lines ("\n\n", "\r\r" or "\r\n\r\n").
  #
  # As with #split_sentences, the captured separators are part of the
  # returned Array.
  def split_paragraphs
    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
  end

  private

  # Indexes +chunks+ in a fresh Classifier::LSI (auto rebuild disabled, index
  # built once at the end) and joins the chunks returned by
  # highest_relative_content with +separator+, stripping each one first.
  def perform_lsi(chunks, count, separator)
    lsi = Classifier::LSI.new :auto_rebuild => false

    chunks.each do |chunk|
      # Blank chunks and single-word chunks are not indexed; this also drops
      # the delimiter elements produced by the capturing split patterns.
      lsi << chunk if chunk.strip.split.size > 1
    end

    lsi.build_index
    summaries = lsi.highest_relative_content(count)
    summaries.map(&:strip).join(separator)
  end
end
@@ -0,0 +1,36 @@
1
+ # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
+ # Copyright:: Copyright (c) 2005 David Fayram II
3
+ # License:: LGPL
4
+
5
module Classifier
  # This class keeps a word => index mapping. It is used to map stemmed words
  # to dimensions of a vector.
  class WordList
    def initialize
      @location_table = {}
    end

    # Adds a word (if it is new) and assigns it a unique dimension.
    #
    # Returns the index given to the word, or nil when the word was already
    # present (the table is left untouched in that case).
    def add_word(word)
      return if @location_table.key?(word)

      # The current size is the next free index because indices start at 0
      # and entries are never removed by this class.
      @location_table[word] = @location_table.size
    end

    # Returns the dimension of the word or nil if the word is not in the space.
    def [](lookup)
      @location_table[lookup]
    end

    # Returns the word stored at dimension +ind+, or nil when no word has
    # that index. Hash#key scans the table directly instead of building a
    # whole inverted Hash on every call.
    def word_for_index(ind)
      @location_table.key(ind)
    end

    # Returns the number of words mapped.
    def size
      @location_table.size
    end
  end
end
@@ -0,0 +1,33 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Unit tests for the training, category and classification interface of
# Classifier::Bayes.
class BayesianTest < Test::Unit::TestCase
  # Every test starts from a classifier with two categories.
  def setup
    @classifier = Classifier::Bayes.new('Interesting', 'Uninteresting')
  end

  # Training a known category must not raise.
  def test_good_training
    assert_nothing_raised do
      @classifier.train_interesting("love")
    end
  end

  # Training a category that was never declared must raise.
  def test_bad_training
    assert_raise(StandardError) do
      @classifier.train_no_category("words")
    end
  end

  # Methods outside the train_* family are not answered.
  def test_bad_method
    assert_raise(NoMethodError) do
      @classifier.forget_everything_you_know("")
    end
  end

  # The classifier reports the categories it was created with.
  def test_categories
    expected = ['Interesting', 'Uninteresting'].sort
    assert_equal(expected, @classifier.categories.sort)
  end

  # A category added after construction shows up next to the initial ones.
  def test_add_category
    @classifier.add_category('Test')
    expected = ['Test', 'Interesting', 'Uninteresting'].sort
    assert_equal(expected, @classifier.categories.sort)
  end

  # Text sharing words with the "uninteresting" sample is classified as such.
  def test_classification
    @classifier.train_interesting("here are some good words. I hope you love them")
    @classifier.train_uninteresting("here are some bad words, I hate you")
    verdict = @classifier.classify("I hate bad words and you")
    assert_equal('Uninteresting', verdict)
  end
end
@@ -0,0 +1,14 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Unit tests for the word_hash / clean_word_hash String extensions.
class StringExtensionsTest < Test::Unit::TestCase
  # word_hash is expected to count punctuation symbols as well as words.
  def test_word_hash
    expected = {:good=>1, :"!"=>1, :hope=>1, :"'"=>1, :"."=>1, :love=>1, :word=>1, :them=>1, :test=>1}
    actual = "here are some good words of test's. I hope you love them!".word_hash
    assert_equal(expected, actual)
  end

  # clean_word_hash is expected to contain the word entries only.
  def test_clean_word_hash
    expected = {:good=>1, :word=>1, :hope=>1, :love=>1, :them=>1, :test=>1}
    actual = "here are some good words of test's. I hope you love them!".clean_word_hash
    assert_equal(expected, actual)
  end
end
@@ -0,0 +1,123 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Unit tests for Classifier::LSI: indexing, categorizing, searching,
# marshalling and the String#summary helper built on top of it.
class LSITest < Test::Unit::TestCase
  def setup
    # we repeat principle words to help weight them.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = "This text deals with dogs. Dogs."
    @str2 = "This text involves dogs too. Dogs! "
    @str3 = "This text revolves around cats. Cats."
    @str4 = "This text also involves cats. Cats!"
    @str5 = "This text involves birds. Birds."
  end

  def test_basic_indexing
    lsi = lsi_from_corpus
    assert !lsi.needs_rebuild?

    # note that the closest match to str1 is str2, even though it is not
    # the closest text match.
    expected = [@str2, @str5, @str3]
    assert_equal(expected, lsi.find_related(@str1, 3))
  end

  def test_not_auto_rebuild
    lsi = Classifier::LSI.new(:auto_rebuild => false)
    add_categorized(lsi, [[@str1, "Dog"], [@str2, "Dog"]])
    assert lsi.needs_rebuild?

    lsi.build_index
    assert !lsi.needs_rebuild?
  end

  def test_basic_categorizing
    lsi = Classifier::LSI.new
    # @str1 is deliberately left out of the index here.
    add_categorized(lsi, labelled_corpus[1..-1])

    assert_equal("Dog", lsi.classify(@str1))
    assert_equal("Cat", lsi.classify(@str3))
    assert_equal("Bird", lsi.classify(@str5))
  end

  def test_external_classifying
    lsi = add_categorized(Classifier::LSI.new, labelled_corpus)

    bayes = Classifier::Bayes.new('Dog', 'Cat', 'Bird')
    bayes.train_dog(@str1)
    bayes.train_dog(@str2)
    bayes.train_cat(@str3)
    bayes.train_cat(@str4)
    bayes.train_bird(@str5)

    # We're talking about dogs. Even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats. So bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = "This text revolves around dogs."
    assert_equal("Dog", lsi.classify(tricky_case))
    assert_not_equal("Dog", bayes.classify(tricky_case))
  end

  def test_recategorize_interface
    lsi = add_categorized(Classifier::LSI.new, labelled_corpus)

    tricky_case = "This text revolves around dogs."
    assert_equal("Dog", lsi.classify(tricky_case))

    # Recategorize as needed.
    lsi.categories_for(@str1).clear.push("Cow")
    lsi.categories_for(@str2).clear.push("Cow")

    assert !lsi.needs_rebuild?
    assert_equal("Cow", lsi.classify(tricky_case))
  end

  def test_search
    lsi = lsi_from_corpus

    # Searching by content and text, note that @str2 comes up first, because
    # both "dog" and "involve" are present. But, the next match is @str1 instead
    # of @str4, because "dog" carries more weight than involves.
    by_content = [@str2, @str1, @str4, @str5, @str3]
    assert_equal(by_content, lsi.search("dog involves", 100))

    # Keyword search shows how the space is mapped out in relation to
    # dog when magnitude is removed. Note the relations. We move from dog
    # through involve and then finally to other words.
    by_keyword = [@str1, @str2, @str4, @str5, @str3]
    assert_equal(by_keyword, lsi.search("dog", 5))
  end

  def test_serialize_safe
    lsi = lsi_from_corpus

    # Round-trip the index through Marshal and compare query results.
    dumped = Marshal.dump(lsi)
    restored = Marshal.load(dumped)

    assert_equal(restored.search("cat", 3), lsi.search("cat", 3))
    assert_equal(restored.find_related(@str1, 3), lsi.find_related(@str1, 3))
  end

  def test_keyword_search
    lsi = add_categorized(Classifier::LSI.new, labelled_corpus)

    assert_equal([:dog, :text, :deal], lsi.highest_ranked_stems(@str1))
  end

  def test_summary
    text = corpus.join
    assert_equal("This text involves dogs too [...] This text also involves cats", text.summary(2))
  end

  private

  # The five fixture strings in their canonical order.
  def corpus
    [@str1, @str2, @str3, @str4, @str5]
  end

  # Fixture strings paired with their category, in the same order as #corpus.
  # An Array of pairs is used so the insertion order is explicit.
  def labelled_corpus
    [[@str1, "Dog"], [@str2, "Dog"], [@str3, "Cat"], [@str4, "Cat"], [@str5, "Bird"]]
  end

  # Returns a default LSI with every fixture string appended via <<.
  def lsi_from_corpus
    lsi = Classifier::LSI.new
    corpus.each { |text| lsi << text }
    lsi
  end

  # Adds each [text, category] pair to +lsi+ with add_item and returns +lsi+.
  def add_categorized(lsi, pairs)
    pairs.each { |text, category| lsi.add_item(text, category) }
    lsi
  end
end
@@ -0,0 +1,4 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'classifier'
metadata ADDED
@@ -0,0 +1,97 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: secobarbital-classifier
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 1
7
+ - 3
8
+ - 1
9
+ - 1
10
+ version: 1.3.1.1
11
+ platform: ruby
12
+ authors:
13
+ - Lucas Carlson
14
+ - Seggy Umboh
15
+ autorequire: classifier
16
+ bindir: bin
17
+ cert_chain: []
18
+
19
+ date: 2010-03-17 00:00:00 -07:00
20
+ default_executable:
21
+ dependencies:
22
+ - !ruby/object:Gem::Dependency
23
+ name: stemmer
24
+ prerelease: false
25
+ requirement: &id001 !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ segments:
30
+ - 1
31
+ - 0
32
+ - 0
33
+ version: 1.0.0
34
+ type: :runtime
35
+ version_requirements: *id001
36
+ description: " A general classifier module to allow Bayesian and other types of classifications.\n"
37
+ email:
38
+ - lucas@rufy.com
39
+ - seggy.umboh@gmail.com
40
+ executables: []
41
+
42
+ extensions: []
43
+
44
+ extra_rdoc_files: []
45
+
46
+ files:
47
+ - lib/classifier/bayes.rb
48
+ - lib/classifier/extensions/string.rb
49
+ - lib/classifier/extensions/vector.rb
50
+ - lib/classifier/extensions/vector_serialize.rb
51
+ - lib/classifier/extensions/word_hash.rb
52
+ - lib/classifier/lsi/content_node.rb
53
+ - lib/classifier/lsi/summary.rb
54
+ - lib/classifier/lsi/word_list.rb
55
+ - lib/classifier/lsi.rb
56
+ - lib/classifier.rb
57
+ - bin/bayes.rb
58
+ - bin/summarize.rb
59
+ - test/bayes/bayesian_test.rb
60
+ - test/extensions/word_hash_test.rb
61
+ - test/lsi/lsi_test.rb
62
+ - test/test_helper.rb
63
+ - LICENSE
64
+ - Rakefile
65
+ - README
66
+ has_rdoc: true
67
+ homepage: http://classifier.rufy.com/
68
+ licenses: []
69
+
70
+ post_install_message:
71
+ rdoc_options: []
72
+
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ segments:
80
+ - 0
81
+ version: "0"
82
+ required_rubygems_version: !ruby/object:Gem::Requirement
83
+ requirements:
84
+ - - ">="
85
+ - !ruby/object:Gem::Version
86
+ segments:
87
+ - 0
88
+ version: "0"
89
+ requirements:
90
+ - A porter-stemmer module to split word stems.
91
+ rubyforge_project:
92
+ rubygems_version: 1.3.6
93
+ signing_key:
94
+ specification_version: 3
95
+ summary: A general classifier module to allow Bayesian and other types of classifications.
96
+ test_files: []
97
+