logankoester-classifier 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +429 -0
- data/Manifest +19 -0
- data/README.rdoc +124 -0
- data/Rakefile +21 -0
- data/VERSION.yml +5 -0
- data/lib/classifier.rb +31 -0
- data/lib/classifier/base.rb +65 -0
- data/lib/classifier/bayes.rb +145 -0
- data/lib/classifier/extensions/vector.rb +100 -0
- data/lib/classifier/extensions/vector_serialize.rb +20 -0
- data/lib/classifier/lsi.rb +348 -0
- data/lib/classifier/lsi/content_node.rb +73 -0
- data/lib/classifier/lsi/summary.rb +31 -0
- data/lib/classifier/lsi/word_list.rb +36 -0
- data/lib/classifier/stopwords.rb +42 -0
- data/lib/classifier/stopwords/en +82 -0
- data/lib/classifier/stopwords/es +339 -0
- data/lib/classifier/stopwords/ru +161 -0
- data/lib/init.rb +1 -0
- data/tasks/test.rake +6 -0
- data/test/base_test.rb +17 -0
- data/test/bayes/bayesian_test.rb +68 -0
- data/test/lsi/lsi_test.rb +167 -0
- data/test/stopwords_test.rb +38 -0
- data/test/test_helper.rb +4 -0
- metadata +127 -0
@@ -0,0 +1,161 @@
|
|
1
|
+
# Russian stopwords
|
2
|
+
# http://snowball.tartarus.org/algorithms/russian/stop.txt
|
3
|
+
и # and
|
4
|
+
в # in/into
|
5
|
+
во # alternative form
|
6
|
+
не # not
|
7
|
+
что # what/that
|
8
|
+
он # he
|
9
|
+
на # on/onto
|
10
|
+
я # i
|
11
|
+
с # from
|
12
|
+
со # alternative form
|
13
|
+
как # how
|
14
|
+
а # milder form of `no' (but)
|
15
|
+
то # conjunction and form of `that'
|
16
|
+
все # all
|
17
|
+
она # she
|
18
|
+
так # so, thus
|
19
|
+
его # him
|
20
|
+
но # but
|
21
|
+
да # yes/and
|
22
|
+
ты # thou
|
23
|
+
к # towards, by
|
24
|
+
у # around, chez
|
25
|
+
же # intensifier particle
|
26
|
+
вы # you
|
27
|
+
за # beyond, behind
|
28
|
+
бы # conditional/subj. particle
|
29
|
+
по # up to, along
|
30
|
+
только # only
|
31
|
+
ее # her
|
32
|
+
мне # to me
|
33
|
+
было # it was
|
34
|
+
вот # here is/are, particle
|
35
|
+
от # away from
|
36
|
+
меня # me
|
37
|
+
еще # still, yet, more
|
38
|
+
нет # no, there isn't/aren't
|
39
|
+
о # about
|
40
|
+
из # out of
|
41
|
+
ему # to him
|
42
|
+
теперь # now
|
43
|
+
когда # when
|
44
|
+
даже # even
|
45
|
+
ну # so, well
|
46
|
+
вдруг # suddenly
|
47
|
+
ли # interrogative particle
|
48
|
+
если # if
|
49
|
+
уже # already, but homonym of `narrower'
|
50
|
+
или # or
|
51
|
+
ни # neither
|
52
|
+
быть # to be
|
53
|
+
был # he was
|
54
|
+
него # prepositional form of его
|
55
|
+
до # up to
|
56
|
+
вас # you accusative
|
57
|
+
нибудь # indef. suffix preceded by hyphen
|
58
|
+
опять # again
|
59
|
+
уж # already, but homonym of `adder'
|
60
|
+
вам # to you
|
61
|
+
сказал # he said
|
62
|
+
ведь # particle `after all'
|
63
|
+
там # there
|
64
|
+
потом # then
|
65
|
+
себя # oneself
|
66
|
+
ничего # nothing
|
67
|
+
ей # to her
|
68
|
+
может # usually with `быть' as `maybe'
|
69
|
+
они # they
|
70
|
+
тут # here
|
71
|
+
где # where
|
72
|
+
есть # there is/are
|
73
|
+
надо # got to, must
|
74
|
+
ней # prepositional form of ей
|
75
|
+
для # for
|
76
|
+
мы # we
|
77
|
+
тебя # thee
|
78
|
+
их # them, their
|
79
|
+
чем # than
|
80
|
+
была # she was
|
81
|
+
сам # self
|
82
|
+
чтоб # in order to
|
83
|
+
без # without
|
84
|
+
будто # as if
|
85
|
+
человек # man, person, one
|
86
|
+
чего # genitive form of `what'
|
87
|
+
раз # once
|
88
|
+
тоже # also
|
89
|
+
себе # to oneself
|
90
|
+
под # beneath
|
91
|
+
жизнь # life
|
92
|
+
будет # will be
|
93
|
+
ж # short form of intensifier particle `же'
|
94
|
+
тогда # then
|
95
|
+
кто # who
|
96
|
+
этот # this
|
97
|
+
говорил # was saying
|
98
|
+
того # genitive form of `that'
|
99
|
+
потому # for that reason
|
100
|
+
этого # genitive form of `this'
|
101
|
+
какой # which
|
102
|
+
совсем # altogether
|
103
|
+
ним # prepositional form of `его', `они'
|
104
|
+
здесь # here
|
105
|
+
этом # prepositional form of `этот'
|
106
|
+
один # one
|
107
|
+
почти # almost
|
108
|
+
мой # my
|
109
|
+
тем # instrumental/dative plural of `тот', `то'
|
110
|
+
чтобы # full form of `in order that'
|
111
|
+
нее # her (acc.)
|
112
|
+
кажется # it seems
|
113
|
+
сейчас # now
|
114
|
+
были # they were
|
115
|
+
куда # where to
|
116
|
+
зачем # why
|
117
|
+
сказать # to say
|
118
|
+
всех # all (acc., gen. preposn. plural)
|
119
|
+
никогда # never
|
120
|
+
сегодня # today
|
121
|
+
можно # possible, one can
|
122
|
+
при # by
|
123
|
+
наконец # finally
|
124
|
+
два # two
|
125
|
+
об # alternative form of `о', about
|
126
|
+
другой # another
|
127
|
+
хоть # even
|
128
|
+
после # after
|
129
|
+
над # above
|
130
|
+
больше # more
|
131
|
+
тот # that one (masc.)
|
132
|
+
через # across, in
|
133
|
+
эти # these
|
134
|
+
нас # us
|
135
|
+
про # about
|
136
|
+
всего # in all, only, of all
|
137
|
+
них # prepositional form of `они' (they)
|
138
|
+
какая # which, feminine
|
139
|
+
много # lots
|
140
|
+
разве # interrogative particle
|
141
|
+
сказала # she said
|
142
|
+
три # three
|
143
|
+
эту # this, acc. fem. sing.
|
144
|
+
моя # my, feminine
|
145
|
+
впрочем # moreover, besides
|
146
|
+
хорошо # good
|
147
|
+
свою # one's own, acc. fem. sing.
|
148
|
+
этой # oblique form of `эта', fem. `this'
|
149
|
+
перед # in front of
|
150
|
+
иногда # sometimes
|
151
|
+
лучше # better
|
152
|
+
чуть # a little
|
153
|
+
том # preposn. form of `that one'
|
154
|
+
нельзя # one must not
|
155
|
+
такой # such a one
|
156
|
+
им # to them
|
157
|
+
более # more
|
158
|
+
всегда # always
|
159
|
+
конечно # of course
|
160
|
+
всю # acc. fem. sing of `all'
|
161
|
+
между # between
|
data/lib/init.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'classifier'
|
data/tasks/test.rake
ADDED
data/test/base_test.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/test_helper'
|
2
|
+
# Exercises the word-hashing helpers called on a Classifier::Base instance.
class HelpersTest < Test::Unit::TestCase
  # word_hash is expected to keep punctuation tokens ("!", "'", ".") next to
  # the word entries.
  def test_word_hash
    expected = {
      'good' => 1, "!" => 1, 'hope' => 1, "'" => 1, "." => 1,
      'love' => 1, 'word' => 1, 'them' => 1, 'test' => 1
    }
    actual = Classifier::Base.new.word_hash("here are some good words of test's. I hope you love them!")

    assert_equal expected, actual
  end

  # clean_word_hash is expected to return only the word entries for the same
  # sentence, without the punctuation tokens.
  def test_clean_word_hash
    expected = {
      'good' => 1, 'word' => 1, 'hope' => 1,
      'love' => 1, 'them' => 1, 'test' => 1
    }
    actual = Classifier::Base.new.clean_word_hash("here are some good words of test's. I hope you love them!")

    assert_equal expected, actual
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
# coding:utf-8
|
2
|
+
|
3
|
+
require File.dirname(__FILE__) + '/../test_helper'
|
4
|
+
|
5
|
+
# Tests for Classifier::Bayes: the dynamic train_*/untrain_* methods,
# category management, classification of English and Russian text, and a
# Marshal round trip.
class BayesianTest < Test::Unit::TestCase
  # Every test starts from a fresh two-category classifier.
  def setup
    @classifier = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
  end

  def test_good_training
    assert_nothing_raised { @classifier.train_interesting "love" }
  end

  # Training against a category that was never declared must raise.
  def test_bad_training
    assert_raise(StandardError) { @classifier.train_no_category "words" }
  end

  # Methods that are not train_*/untrain_* style must still raise NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
  end

  def test_categories
    assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
  end

  def test_add_category
    @classifier.add_category 'Test'
    assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
  end

  def test_classification
    @classifier.train_interesting "here are some good words. I hope you love them"
    @classifier.train_uninteresting "here are some bad words, I hate you"
    assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
  end

  def test_regression_untrain_nil_fixnum
    # Regression: if a word of the untraining text is not present in the
    # category, a "TypeError: nil can't be coerced into Fixnum" was raised.
    # The explicit assertion makes the intent visible, as in test_good_training.
    assert_nothing_raised { @classifier.untrain_interesting "nothing" }
  end

  def test_ru_classification
    c = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'], :language => "ru"
    c.train_interesting "вот несколько хороших слов. Я надеюсь вам они понравились"
    c.train_uninteresting "вот несколько плохих слов. Я тебя ненавижу"
    assert_equal 'Uninteresting', c.classify("Я ненавижу плохие слова и тебя")
  end

  # Classification scores must not depend on the letter case of Cyrillic input.
  def test_case_insensitive
    c = Classifier::Bayes.new :categories => [:good, :bad], :language => "ru"
    c.train_good "Хорошо"
    c.train_bad "Плохо"

    assert_equal c.classifications("ХОРОШО"), c.classifications("хорошо")
    assert_equal c.classifications("плОХО"), c.classifications("плохо")
  end

  # A classifier restored through Marshal must classify like the original.
  def test_serialize
    txt = "this can be serialized"
    @classifier.train_interesting(txt)
    @classifier.train_uninteresting("really uninteresting")

    b2 = Marshal.load(Marshal.dump(@classifier))
    assert_equal @classifier.classify(txt), b2.classify(txt)
  end
end
|
@@ -0,0 +1,167 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../test_helper'
|
2
|
+
# Tests for Classifier::LSI: indexing, rebuild tracking, single and multiple
# classification, search, Marshal round trip, stem ranking, and the summary
# method called on a String.
class LSITest < Test::Unit::TestCase
  def setup
    # We repeat principal words to help weight them.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = "This text deals with dogs. Dogs."
    @str2 = "This text involves dogs too. Dogs! "
    @str3 = "This text revolves around cats. Cats."
    @str4 = "This text also involves cats. Cats!"
    @str5 = "This text involves birds. Birds."
    @str6 = "Is it about dogs or birds?"
    @str7 = "Is it about birds or cats?"
    @str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
  end

  def test_basic_indexing
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
    assert !lsi.needs_rebuild?

    # Note that the closest match to str1 is str2, even though it is not
    # the closest text match.
    assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
  end

  # With :auto_rebuild => false the index stays stale until build_index runs.
  def test_not_auto_rebuild
    lsi = Classifier::LSI.new :auto_rebuild => false
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    assert lsi.needs_rebuild?
    lsi.build_index
    assert !lsi.needs_rebuild?
  end

  # A single indexed item is expected to yield no classification at all.
  def test_basic_categorizing_with_too_small_dataset
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"

    assert_nil lsi.classify(@str1)
    assert_equal [], lsi.classify_multiple(@str3)
  end

  def test_basic_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal "Dog", lsi.classify(@str1)
    assert_equal "Cat", lsi.classify(@str3)
    assert_equal "Bird", lsi.classify(@str5)
    assert_equal "Dog", lsi.classify(@str6)
    assert_equal "Bird", lsi.classify(@str7)
    assert_equal "Bird", lsi.classify(@str8)
  end

  def test_multiple_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal ["Dog", "Bird"], lsi.classify_multiple(@str6)
    assert_equal ["Cat", "Bird"], lsi.classify_multiple(@str7)
    assert_equal ["Bird"], lsi.classify_multiple(@str8)
  end

  # Same idea as test_multiple_categorizing, but the multi-category documents
  # are the indexed items and the single-topic ones are the queries.
  def test_multiple_categorizing_reverse
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str6, "Dog", "Bird", "Flying"
    lsi.add_item @str7, "Cat", "Bird"
    lsi.add_item @str8, "Bird", "Dog", "Cat"

    assert_equal ["Dog"], lsi.classify_multiple(@str2)
    assert_equal ["Cat", "Bird"], lsi.classify_multiple(@str5)

    # Test with a single word on its own.
    assert_equal "Bird", lsi.classify("Bird!")
    assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple("Bird!")
  end

  def test_external_classifying
    lsi = Classifier::LSI.new
    bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
    lsi.add_item @str1, "Dog"
    bayes.train_dog @str1
    lsi.add_item @str2, "Dog"
    bayes.train_dog @str2
    lsi.add_item @str3, "Cat"
    bayes.train_cat @str3
    lsi.add_item @str4, "Cat"
    bayes.train_cat @str4
    lsi.add_item @str5, "Bird"
    bayes.train_bird @str5

    # We're talking about dogs. Even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats. So bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify(tricky_case)
    assert_not_equal "Dog", bayes.classify(tricky_case)
  end

  def test_recategorize_interface
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify(tricky_case)

    # Recategorize as needed.
    lsi.categories_for(@str1).clear.push "Cow"
    lsi.categories_for(@str2).clear.push "Cow"

    # Changing categories in place is expected not to mark the index stale.
    assert !lsi.needs_rebuild?
    assert_equal "Cow", lsi.classify(tricky_case)
  end

  def test_search
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    # Searching by content and text, note that @str2 comes up first, because
    # both "dog" and "involve" are present. But, the next match is @str1 instead
    # of @str4, because "dog" carries more weight than involves.
    assert_equal([@str2, @str1, @str4, @str5, @str3],
                 lsi.search("dog involves", 100))

    # Keyword search shows how the space is mapped out in relation to
    # dog when magnitude is removed. Note the relations. We move from dog
    # through involve and then finally to other words.
    assert_equal([@str1, @str2, @str4, @str5, @str3],
                 lsi.search("dog", 5))
  end

  # An index restored through Marshal must answer queries like the original.
  def test_serialize_safe
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    lsi_md = Marshal.dump lsi
    lsi_m = Marshal.load lsi_md

    assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
    assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
  end

  def test_keyword_search
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal ['dog', 'text', 'deal'], lsi.highest_ranked_stems(@str1)
  end

  # summary is called on the String produced by joining the fixtures without
  # a separator.
  def test_summary
    assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
# coding:utf-8
|
2
|
+
require File.dirname(__FILE__) + '/test_helper'
|
3
|
+
require 'tempfile'
|
4
|
+
|
5
|
+
# Tests for Classifier::StopWords.for: list sizes for the en and ru lists,
# a membership check on the es list, the empty result for an unknown
# language, and loading a list from a caller-supplied directory.
class StopWordsTest < Test::Unit::TestCase
  # @tmp holds the path of a stopword file created by a test, if any.
  def setup
    @tmp = nil
  end

  # Resets the StopWords state and removes the file a test may have created.
  # The existence check keeps teardown from raising Errno::ENOENT (and hiding
  # the real failure) when the file was never written.
  def teardown
    Classifier::StopWords.reset
    File.delete(@tmp) if @tmp && File.exist?(@tmp)
  end

  def test_en
    assert_equal 80, Classifier::StopWords.for('en').size
  end

  def test_ru
    assert_equal 159, Classifier::StopWords.for('ru').size
  end

  def test_stopword_es
    list = Classifier::StopWords.for('es')
    assert list.include?('más')
  end

  def test_unknown
    assert_equal [], Classifier::StopWords.for('_unknown_')
  end

  # Writes a two-word list next to this test file and loads it by passing the
  # directory as the second argument.
  def test_custom_lang_file
    lang = 'xxyyzz'
    @tmp = File.join(File.dirname(__FILE__), lang)
    File.open(@tmp, 'w') { |f| f.puts "str1\nstr2" }
    assert_equal ["str1", "str2"], Classifier::StopWords.for(lang,
                                                             File.dirname(@tmp))
  end
end
|