logankoester-classifier 1.4.3
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +429 -0
- data/Manifest +19 -0
- data/README.rdoc +124 -0
- data/Rakefile +21 -0
- data/VERSION.yml +5 -0
- data/lib/classifier.rb +31 -0
- data/lib/classifier/base.rb +65 -0
- data/lib/classifier/bayes.rb +145 -0
- data/lib/classifier/extensions/vector.rb +100 -0
- data/lib/classifier/extensions/vector_serialize.rb +20 -0
- data/lib/classifier/lsi.rb +348 -0
- data/lib/classifier/lsi/content_node.rb +73 -0
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/lsi/word_list.rb +36 -0
- data/lib/classifier/stopwords.rb +42 -0
- data/lib/classifier/stopwords/en +82 -0
- data/lib/classifier/stopwords/es +339 -0
- data/lib/classifier/stopwords/ru +161 -0
- data/lib/init.rb +1 -0
- data/tasks/test.rake +6 -0
- data/test/base_test.rb +17 -0
- data/test/bayes/bayesian_test.rb +68 -0
- data/test/lsi/lsi_test.rb +167 -0
- data/test/stopwords_test.rb +38 -0
- data/test/test_helper.rb +4 -0
- metadata +127 -0
@@ -0,0 +1,161 @@
|
|
1
|
+
# Russian stopwords
|
2
|
+
# http://snowball.tartarus.org/algorithms/russian/stop.txt
|
3
|
+
и # and
|
4
|
+
в # in/into
|
5
|
+
во # alternative form
|
6
|
+
не # not
|
7
|
+
что # what/that
|
8
|
+
он # he
|
9
|
+
на # on/onto
|
10
|
+
я # i
|
11
|
+
с # from
|
12
|
+
со # alternative form
|
13
|
+
как # how
|
14
|
+
а # milder form of `no' (but)
|
15
|
+
то # conjunction and form of `that'
|
16
|
+
все # all
|
17
|
+
она # she
|
18
|
+
так # so, thus
|
19
|
+
его # him
|
20
|
+
но # but
|
21
|
+
да # yes/and
|
22
|
+
ты # thou
|
23
|
+
к # towards, by
|
24
|
+
у # around, chez
|
25
|
+
же # intensifier particle
|
26
|
+
вы # you
|
27
|
+
за # beyond, behind
|
28
|
+
бы # conditional/subj. particle
|
29
|
+
по # up to, along
|
30
|
+
только # only
|
31
|
+
ее # her
|
32
|
+
мне # to me
|
33
|
+
было # it was
|
34
|
+
вот # here is/are, particle
|
35
|
+
от # away from
|
36
|
+
меня # me
|
37
|
+
еще # still, yet, more
|
38
|
+
нет # no, there isn't/aren't
|
39
|
+
о # about
|
40
|
+
из # out of
|
41
|
+
ему # to him
|
42
|
+
теперь # now
|
43
|
+
когда # when
|
44
|
+
даже # even
|
45
|
+
ну # so, well
|
46
|
+
вдруг # suddenly
|
47
|
+
ли # interrogative particle
|
48
|
+
если # if
|
49
|
+
уже # already, but homonym of `narrower'
|
50
|
+
или # or
|
51
|
+
ни # neither
|
52
|
+
быть # to be
|
53
|
+
был # he was
|
54
|
+
него # prepositional form of его
|
55
|
+
до # up to
|
56
|
+
вас # you accusative
|
57
|
+
нибудь # indef. suffix preceded by hyphen
|
58
|
+
опять # again
|
59
|
+
уж # already, but homonym of `adder'
|
60
|
+
вам # to you
|
61
|
+
сказал # he said
|
62
|
+
ведь # particle `after all'
|
63
|
+
там # there
|
64
|
+
потом # then
|
65
|
+
себя # oneself
|
66
|
+
ничего # nothing
|
67
|
+
ей # to her
|
68
|
+
может # usually with `быть' as `maybe'
|
69
|
+
они # they
|
70
|
+
тут # here
|
71
|
+
где # where
|
72
|
+
есть # there is/are
|
73
|
+
надо # got to, must
|
74
|
+
ней # prepositional form of ей
|
75
|
+
для # for
|
76
|
+
мы # we
|
77
|
+
тебя # thee
|
78
|
+
их # them, their
|
79
|
+
чем # than
|
80
|
+
была # she was
|
81
|
+
сам # self
|
82
|
+
чтоб # in order to
|
83
|
+
без # without
|
84
|
+
будто # as if
|
85
|
+
человек # man, person, one
|
86
|
+
чего # genitive form of `what'
|
87
|
+
раз # once
|
88
|
+
тоже # also
|
89
|
+
себе # to oneself
|
90
|
+
под # beneath
|
91
|
+
жизнь # life
|
92
|
+
будет # will be
|
93
|
+
ж # short form of intensifier particle `же'
|
94
|
+
тогда # then
|
95
|
+
кто # who
|
96
|
+
этот # this
|
97
|
+
говорил # was saying
|
98
|
+
того # genitive form of `that'
|
99
|
+
потому # for that reason
|
100
|
+
этого # genitive form of `this'
|
101
|
+
какой # which
|
102
|
+
совсем # altogether
|
103
|
+
ним # prepositional form of `его', `они'
|
104
|
+
здесь # here
|
105
|
+
этом # prepositional form of `этот'
|
106
|
+
один # one
|
107
|
+
почти # almost
|
108
|
+
мой # my
|
109
|
+
тем # instrumental/dative plural of `тот', `то'
|
110
|
+
чтобы # full form of `in order that'
|
111
|
+
нее # her (acc.)
|
112
|
+
кажется # it seems
|
113
|
+
сейчас # now
|
114
|
+
были # they were
|
115
|
+
куда # where to
|
116
|
+
зачем # why
|
117
|
+
сказать # to say
|
118
|
+
всех # all (acc., gen. preposn. plural)
|
119
|
+
никогда # never
|
120
|
+
сегодня # today
|
121
|
+
можно # possible, one can
|
122
|
+
при # by
|
123
|
+
наконец # finally
|
124
|
+
два # two
|
125
|
+
об # alternative form of `о', about
|
126
|
+
другой # another
|
127
|
+
хоть # even
|
128
|
+
после # after
|
129
|
+
над # above
|
130
|
+
больше # more
|
131
|
+
тот # that one (masc.)
|
132
|
+
через # across, in
|
133
|
+
эти # these
|
134
|
+
нас # us
|
135
|
+
про # about
|
136
|
+
всего # in all, only, of all
|
137
|
+
них # prepositional form of `они' (they)
|
138
|
+
какая # which, feminine
|
139
|
+
много # lots
|
140
|
+
разве # interrogative particle
|
141
|
+
сказала # she said
|
142
|
+
три # three
|
143
|
+
эту # this, acc. fem. sing.
|
144
|
+
моя # my, feminine
|
145
|
+
впрочем # moreover, besides
|
146
|
+
хорошо # good
|
147
|
+
свою # one's own, acc. fem. sing.
|
148
|
+
этой # oblique form of `эта', fem. `this'
|
149
|
+
перед # in front of
|
150
|
+
иногда # sometimes
|
151
|
+
лучше # better
|
152
|
+
чуть # a little
|
153
|
+
том # preposn. form of `that one'
|
154
|
+
нельзя # one must not
|
155
|
+
такой # such a one
|
156
|
+
им # to them
|
157
|
+
более # more
|
158
|
+
всегда # always
|
159
|
+
конечно # of course
|
160
|
+
всю # acc. fem. sing of `all'
|
161
|
+
между # between
|
data/lib/init.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'classifier'
|
data/tasks/test.rake
ADDED
data/test/base_test.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/test_helper'
|
2
|
+
# Exercises the word-hashing helpers exposed by Classifier::Base.
class HelpersTest < Test::Unit::TestCase
  # word_hash is expected to report punctuation tokens alongside the words.
  def test_word_hash
    sentence = "here are some good words of test's. I hope you love them!"
    expected = {
      'good' => 1, "!" => 1, 'hope' => 1, "'" => 1, "." => 1,
      'love' => 1, 'word' => 1, 'them' => 1, 'test' => 1
    }

    assert_equal expected, Classifier::Base.new.word_hash(sentence)
  end

  # clean_word_hash is expected to report the same words without the
  # punctuation tokens.
  def test_clean_word_hash
    sentence = "here are some good words of test's. I hope you love them!"
    expected = {
      'good' => 1, 'word' => 1, 'hope' => 1,
      'love' => 1, 'them' => 1, 'test' => 1
    }

    assert_equal expected, Classifier::Base.new.clean_word_hash(sentence)
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# coding:utf-8
|
2
|
+
|
3
|
+
require File.dirname(__FILE__) + '/../test_helper'
|
4
|
+
|
5
|
+
# Tests for Classifier::Bayes: training, category management, classification
# (English and Russian text), case handling and Marshal round-tripping.
class BayesianTest < Test::Unit::TestCase
  # Every test starts from a fresh two-category classifier.
  def setup
    @classifier = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
  end

  # Training a known category through the dynamic train_* method must work.
  def test_good_training
    assert_nothing_raised { @classifier.train_interesting "love" }
  end

  # Training a category that was never registered must fail.
  def test_bad_training
    assert_raise(StandardError) { @classifier.train_no_category "words" }
  end

  # Methods outside the train_*/untrain_* family must not be swallowed.
  def test_bad_method
    assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
  end

  def test_categories
    assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
  end

  def test_add_category
    @classifier.add_category 'Test'
    assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
  end

  def test_classification
    @classifier.train_interesting "here are some good words. I hope you love them"
    @classifier.train_uninteresting "here are some bad words, I hate you"
    assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
  end

  def test_regression_untrain_nil_fixnum
    # Regression guard: if a word of the untraining text is not present in
    # the category, a "TypeError: nil can't be coerced into Fixnum" was raised.
    # The explicit assertion makes the intent visible, as in test_good_training.
    assert_nothing_raised { @classifier.untrain_interesting "nothing" }
  end

  # Same scenario as test_classification, with Russian text and :language => "ru".
  def test_ru_classification
    c = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'], :language => "ru"
    c.train_interesting "вот несколько хороших слов. Я надеюсь вам они понравились"
    c.train_uninteresting "вот несколько плохих слов. Я тебя ненавижу"
    assert_equal 'Uninteresting', c.classify("Я ненавижу плохие слова и тебя")
  end

  # Upper-, mixed- and lower-case Cyrillic input must score identically.
  def test_case_insensitive
    c = Classifier::Bayes.new :categories => [:good, :bad], :language => "ru"
    c.train_good "Хорошо"
    c.train_bad "Плохо"

    assert_equal c.classifications("ХОРОШО"), c.classifications("хорошо")
    assert_equal c.classifications("плОХО"), c.classifications("плохо")
  end

  # A classifier restored from a Marshal dump must classify like the original.
  def test_serialize
    txt = "this can be serialized"
    @classifier.train_interesting(txt)
    @classifier.train_uninteresting("really uninteresting")

    b2 = Marshal.load(Marshal.dump(@classifier))
    assert_equal @classifier.classify(txt), b2.classify(txt)
  end
end
|
@@ -0,0 +1,167 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
+
# Tests for Classifier::LSI: indexing, rebuild tracking, single and multiple
# categorisation, search, serialisation, keyword ranking and summaries.
class LSITest < Test::Unit::TestCase
  def setup
    # We repeat principal words to help weight them.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = "This text deals with dogs. Dogs."
    @str2 = "This text involves dogs too. Dogs! "
    @str3 = "This text revolves around cats. Cats."
    @str4 = "This text also involves cats. Cats!"
    @str5 = "This text involves birds. Birds."
    @str6 = "Is it about dogs or birds?"
    @str7 = "Is it about birds or cats?"
    @str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
  end

  def test_basic_indexing
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
    assert !lsi.needs_rebuild?

    # Note that the closest match to str1 is str2, even though it is not
    # the closest text match.
    assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
  end

  # With :auto_rebuild => false the index stays stale until build_index runs.
  def test_not_auto_rebuild
    lsi = Classifier::LSI.new :auto_rebuild => false
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    assert lsi.needs_rebuild?
    lsi.build_index
    assert !lsi.needs_rebuild?
  end

  # A single indexed item is not enough to classify anything.
  def test_basic_categorizing_with_too_small_dataset
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"

    # assert_nil is the dedicated assertion for a nil expectation.
    assert_nil lsi.classify(@str1)
    assert_equal [], lsi.classify_multiple(@str3)
  end

  def test_basic_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal "Dog", lsi.classify(@str1)
    assert_equal "Cat", lsi.classify(@str3)
    assert_equal "Bird", lsi.classify(@str5)
    assert_equal "Dog", lsi.classify(@str6)
    assert_equal "Bird", lsi.classify(@str7)
    assert_equal "Bird", lsi.classify(@str8)
  end

  def test_multiple_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal ["Dog", "Bird"], lsi.classify_multiple(@str6)
    assert_equal ["Cat", "Bird"], lsi.classify_multiple(@str7)
    assert_equal ["Bird"], lsi.classify_multiple(@str8)
  end

  # Here the multi-topic sentences are the training items, each with several
  # categories, and the single-topic sentences are the ones classified.
  def test_multiple_categorizing_reverse
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str6, "Dog", "Bird", "Flying"
    lsi.add_item @str7, "Cat", "Bird"
    lsi.add_item @str8, "Bird", "Dog", "Cat"

    assert_equal ["Dog"], lsi.classify_multiple(@str2)
    assert_equal ["Cat", "Bird"], lsi.classify_multiple(@str5)

    # Test with a lone word as the whole query.
    assert_equal "Bird", lsi.classify("Bird!")
    assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple("Bird!")
  end

  def test_external_classifying
    lsi = Classifier::LSI.new
    bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
    lsi.add_item @str1, "Dog"
    bayes.train_dog @str1
    lsi.add_item @str2, "Dog"
    bayes.train_dog @str2
    lsi.add_item @str3, "Cat"
    bayes.train_cat @str3
    lsi.add_item @str4, "Cat"
    bayes.train_cat @str4
    lsi.add_item @str5, "Bird"
    bayes.train_bird @str5

    # We're talking about dogs, even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats, so Bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify(tricky_case)
    assert_not_equal "Dog", bayes.classify(tricky_case)
  end

  def test_recategorize_interface
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify(tricky_case)

    # Recategorize as needed.
    lsi.categories_for(@str1).clear.push "Cow"
    lsi.categories_for(@str2).clear.push "Cow"

    # Changing categories in place must not mark the index as stale.
    assert !lsi.needs_rebuild?
    assert_equal "Cow", lsi.classify(tricky_case)
  end

  def test_search
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    # Searching by content and text: note that @str2 comes up first, because
    # both "dog" and "involve" are present. But the next match is @str1 instead
    # of @str4, because "dog" carries more weight than "involves".
    assert_equal([@str2, @str1, @str4, @str5, @str3],
                 lsi.search("dog involves", 100))

    # Keyword search shows how the space is mapped out in relation to
    # "dog" when magnitude is removed. Note the relations. We move from dog
    # through involve and then finally to other words.
    assert_equal([@str1, @str2, @str4, @str5, @str3],
                 lsi.search("dog", 5))
  end

  # An index restored from a Marshal dump must answer queries like the original.
  def test_serialize_safe
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    lsi_md = Marshal.dump lsi
    lsi_m = Marshal.load lsi_md

    assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
    assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
  end

  def test_keyword_search
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal ['dog', 'text', 'deal'], lsi.highest_ranked_stems(@str1)
  end

  # String#summary is called on the five fixture strings joined together.
  def test_summary
    assert_equal "This text involves dogs too [...] This text also involves cats",
                 [@str1, @str2, @str3, @str4, @str5].join.summary(2)
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding:utf-8
|
2
|
+
require File.dirname(__FILE__) + '/test_helper'
|
3
|
+
require 'tempfile'
|
4
|
+
|
5
|
+
# Tests for Classifier::StopWords: bundled language lists, unknown languages
# and loading a list from a caller-supplied directory.
class StopWordsTest < Test::Unit::TestCase
  # @tmp holds the path of a scratch stop-word file, when a test creates one.
  def setup
    @tmp = nil
  end

  # Resets the stop-word state and removes the scratch file, if any.
  def teardown
    Classifier::StopWords.reset
    return if @tmp.nil?

    File.delete(@tmp)
  end

  def test_en
    english = Classifier::StopWords.for('en')
    assert_equal 80, english.size
  end

  def test_ru
    russian = Classifier::StopWords.for('ru')
    assert_equal 159, russian.size
  end

  # The Spanish list must contain an accented word.
  def test_stopword_es
    assert Classifier::StopWords.for('es').include?('más')
  end

  # A language with no list yields an empty array.
  def test_unknown
    assert_equal [], Classifier::StopWords.for('_unknown_')
  end

  # Writes a two-word list next to this test file and loads it by directory.
  def test_custom_lang_file
    lang = 'xxyyzz'
    @tmp = File.join(File.dirname(__FILE__), lang)
    File.open(@tmp, 'w') do |file|
      file.puts "str1\nstr2"
    end

    loaded = Classifier::StopWords.for(lang, File.dirname(@tmp))
    assert_equal ["str1", "str2"], loaded
  end
end
|