logankoester-classifier 1.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,161 @@
1
+ # Russian stopwords
2
+ # http://snowball.tartarus.org/algorithms/russian/stop.txt
3
+ и # and
4
+ в # in/into
5
+ во # alternative form
6
+ не # not
7
+ что # what/that
8
+ он # he
9
+ на # on/onto
10
+ я # i
11
+ с # from
12
+ со # alternative form
13
+ как # how
14
+ а # milder form of `no' (but)
15
+ то # conjunction and form of `that'
16
+ все # all
17
+ она # she
18
+ так # so, thus
19
+ его # him
20
+ но # but
21
+ да # yes/and
22
+ ты # thou
23
+ к # towards, by
24
+ у # around, chez
25
+ же # intensifier particle
26
+ вы # you
27
+ за # beyond, behind
28
+ бы # conditional/subj. particle
29
+ по # up to, along
30
+ только # only
31
+ ее # her
32
+ мне # to me
33
+ было # it was
34
+ вот # here is/are, particle
35
+ от # away from
36
+ меня # me
37
+ еще # still, yet, more
38
+ нет # no, there isn't/aren't
39
+ о # about
40
+ из # out of
41
+ ему # to him
42
+ теперь # now
43
+ когда # when
44
+ даже # even
45
+ ну # so, well
46
+ вдруг # suddenly
47
+ ли # interrogative particle
48
+ если # if
49
+ уже # already, but homonym of `narrower'
50
+ или # or
51
+ ни # neither
52
+ быть # to be
53
+ был # he was
54
+ него # prepositional form of его
55
+ до # up to
56
+ вас # you accusative
57
+ нибудь # indef. suffix preceded by hyphen
58
+ опять # again
59
+ уж # already, but homonym of `adder'
60
+ вам # to you
61
+ сказал # he said
62
+ ведь # particle `after all'
63
+ там # there
64
+ потом # then
65
+ себя # oneself
66
+ ничего # nothing
67
+ ей # to her
68
+ может # usually with `быть' as `maybe'
69
+ они # they
70
+ тут # here
71
+ где # where
72
+ есть # there is/are
73
+ надо # got to, must
74
+ ней # prepositional form of ей
75
+ для # for
76
+ мы # we
77
+ тебя # thee
78
+ их # them, their
79
+ чем # than
80
+ была # she was
81
+ сам # self
82
+ чтоб # in order to
83
+ без # without
84
+ будто # as if
85
+ человек # man, person, one
86
+ чего # genitive form of `what'
87
+ раз # once
88
+ тоже # also
89
+ себе # to oneself
90
+ под # beneath
91
+ жизнь # life
92
+ будет # will be
93
+ ж # short form of intensifier particle `же'
94
+ тогда # then
95
+ кто # who
96
+ этот # this
97
+ говорил # was saying
98
+ того # genitive form of `that'
99
+ потому # for that reason
100
+ этого # genitive form of `this'
101
+ какой # which
102
+ совсем # altogether
103
+ ним # prepositional form of `его', `они'
104
+ здесь # here
105
+ этом # prepositional form of `этот'
106
+ один # one
107
+ почти # almost
108
+ мой # my
109
+ тем # instrumental/dative plural of `тот', `то'
110
+ чтобы # full form of `in order that'
111
+ нее # her (acc.)
112
+ кажется # it seems
113
+ сейчас # now
114
+ были # they were
115
+ куда # where to
116
+ зачем # why
117
+ сказать # to say
118
+ всех # all (acc., gen. preposn. plural)
119
+ никогда # never
120
+ сегодня # today
121
+ можно # possible, one can
122
+ при # by
123
+ наконец # finally
124
+ два # two
125
+ об # alternative form of `о', about
126
+ другой # another
127
+ хоть # even
128
+ после # after
129
+ над # above
130
+ больше # more
131
+ тот # that one (masc.)
132
+ через # across, in
133
+ эти # these
134
+ нас # us
135
+ про # about
136
+ всего # in all, only, of all
137
+ них # prepositional form of `они' (they)
138
+ какая # which, feminine
139
+ много # lots
140
+ разве # interrogative particle
141
+ сказала # she said
142
+ три # three
143
+ эту # this, acc. fem. sing.
144
+ моя # my, feminine
145
+ впрочем # moreover, besides
146
+ хорошо # good
147
+ свою # one's own, acc. fem. sing.
148
+ этой # oblique form of `эта', fem. `this'
149
+ перед # in front of
150
+ иногда # sometimes
151
+ лучше # better
152
+ чуть # a little
153
+ том # preposn. form of `that one'
154
+ нельзя # one must not
155
+ такой # such a one
156
+ им # to them
157
+ более # more
158
+ всегда # always
159
+ конечно # of course
160
+ всю # acc. fem. sing of `all'
161
+ между # between
data/lib/init.rb ADDED
@@ -0,0 +1 @@
1
+ require 'classifier'
data/tasks/test.rake ADDED
@@ -0,0 +1,6 @@
1
+ require 'rake/testtask'
2
+
3
+
4
+ Rake::TestTask.new do |t|
5
+ t.test_files = FileList['test/**/*_test.rb']
6
+ end
data/test/base_test.rb ADDED
@@ -0,0 +1,17 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
+ class HelpersTest < Test::Unit::TestCase
3
+
4
+ def test_word_hash
5
+ c = Classifier::Base.new
6
+ hash = {'good'=>1, "!"=>1, 'hope'=>1, "'"=>1, "."=>1, 'love'=>1, 'word'=>1, 'them'=>1, 'test'=>1}
7
+ assert_equal hash, c.word_hash("here are some good words of test's. I hope you love them!")
8
+ end
9
+
10
+
11
+ def test_clean_word_hash
12
+ c = Classifier::Base.new
13
+ hash = {'good'=>1, 'word'=>1, 'hope'=>1, 'love'=>1, 'them'=>1, 'test'=>1}
14
+ assert_equal hash, c.clean_word_hash("here are some good words of test's. I hope you love them!")
15
+ end
16
+
17
+ end
@@ -0,0 +1,68 @@
1
+ # coding:utf-8
2
+
3
+ require File.dirname(__FILE__) + '/../test_helper'
4
+
5
+ class BayesianTest < Test::Unit::TestCase
6
+ def setup
7
+ @classifier = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
8
+ end
9
+
10
+ def test_good_training
11
+ assert_nothing_raised { @classifier.train_interesting "love" }
12
+ end
13
+
14
+ def test_bad_training
15
+ assert_raise(StandardError) { @classifier.train_no_category "words" }
16
+ end
17
+
18
+ def test_bad_method
19
+ assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
20
+ end
21
+
22
+ def test_categories
23
+ assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
24
+ end
25
+
26
+ def test_add_category
27
+ @classifier.add_category 'Test'
28
+ assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
29
+ end
30
+
31
+ def test_classification
32
+ @classifier.train_interesting "here are some good words. I hope you love them"
33
+ @classifier.train_uninteresting "here are some bad words, I hate you"
34
+ assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
35
+ end
36
+
37
+ def test_regression_untrain_nil_fixnum
38
+ # if a word of the untraining text is not present on the category, a
39
+ # "TypeError: nil can't be coerced into Fixnum" is raised
40
+ @classifier.untrain_interesting "nothing"
41
+ end
42
+
43
+ def test_ru_classification
44
+ c = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'], :language => "ru"
45
+ c.train_interesting "вот несколько хороших слов. Я надеюсь вам они понравились"
46
+ c.train_uninteresting "вот несколько плохих слов. Я тебя ненавижу"
47
+ assert_equal 'Uninteresting', c.classify("Я ненавижу плохие слова и тебя")
48
+ end
49
+
50
+ def test_case_insensitive
51
+ c = Classifier::Bayes.new :categories => [:good, :bad], :language => "ru"
52
+ c.train_good "Хорошо"
53
+ c.train_bad "Плохо"
54
+
55
+ assert_equal c.classifications("ХОРОШО"), c.classifications("хорошо")
56
+ assert_equal c.classifications("плОХО"), c.classifications("плохо")
57
+ end
58
+
59
+ def test_serialize
60
+ txt = "this can be serialized"
61
+ @classifier.train_interesting(txt)
62
+ @classifier.train_uninteresting("really uninteresting")
63
+
64
+ b2 = Marshal::load(Marshal::dump(@classifier))
65
+ assert_equal @classifier.classify(txt), b2.classify(txt)
66
+ end
67
+
68
+ end
@@ -0,0 +1,167 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
+ class LSITest < Test::Unit::TestCase
3
+ def setup
4
+ # we repeat principle words to help weight them.
5
+ # This test is rather delicate, since this system is mostly noise.
6
+ @str1 = "This text deals with dogs. Dogs."
7
+ @str2 = "This text involves dogs too. Dogs! "
8
+ @str3 = "This text revolves around cats. Cats."
9
+ @str4 = "This text also involves cats. Cats!"
10
+ @str5 = "This text involves birds. Birds."
11
+ @str6 = "Is it about dogs or birds?"
12
+ @str7 = "Is it about birds or cats?"
13
+ @str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
14
+ end
15
+
16
+ def test_basic_indexing
17
+ lsi = Classifier::LSI.new
18
+ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
19
+ assert ! lsi.needs_rebuild?
20
+
21
+ # note that the closest match to str1 is str2, even though it is not
22
+ # the closest text match.
23
+ assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
24
+ end
25
+
26
+ def test_not_auto_rebuild
27
+ lsi = Classifier::LSI.new :auto_rebuild => false
28
+ lsi.add_item @str1, "Dog"
29
+ lsi.add_item @str2, "Dog"
30
+ assert lsi.needs_rebuild?
31
+ lsi.build_index
32
+ assert ! lsi.needs_rebuild?
33
+ end
34
+
35
+ def test_basic_categorizing_with_too_small_dataset
36
+ lsi = Classifier::LSI.new
37
+ lsi.add_item @str2, "Dog"
38
+
39
+ assert_equal nil, lsi.classify( @str1 )
40
+ assert_equal [], lsi.classify_multiple( @str3 )
41
+ end
42
+
43
+ def test_basic_categorizing
44
+ lsi = Classifier::LSI.new
45
+ lsi.add_item @str2, "Dog"
46
+ lsi.add_item @str3, "Cat"
47
+ lsi.add_item @str4, "Cat"
48
+ lsi.add_item @str5, "Bird"
49
+
50
+ assert_equal "Dog", lsi.classify( @str1 )
51
+ assert_equal "Cat", lsi.classify( @str3 )
52
+ assert_equal "Bird", lsi.classify( @str5 )
53
+ assert_equal "Dog", lsi.classify( @str6 )
54
+ assert_equal "Bird", lsi.classify( @str7 )
55
+ assert_equal "Bird", lsi.classify( @str8 )
56
+ end
57
+
58
+ def test_multiple_categorizing
59
+ lsi = Classifier::LSI.new
60
+ lsi.add_item @str1, "Dog"
61
+ lsi.add_item @str2, "Dog"
62
+ lsi.add_item @str3, "Cat"
63
+ lsi.add_item @str4, "Cat"
64
+ lsi.add_item @str5, "Bird"
65
+
66
+ assert_equal ["Dog", "Bird"], lsi.classify_multiple( @str6 )
67
+ assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str7 )
68
+ assert_equal ["Bird"], lsi.classify_multiple( @str8 )
69
+ end
70
+
71
+ def test_multiple_categorizing_reverse
72
+ lsi = Classifier::LSI.new
73
+ lsi.add_item @str1, "Dog"
74
+ lsi.add_item @str3, "Cat"
75
+ lsi.add_item @str4, "Cat"
76
+ lsi.add_item @str6, "Dog", "Bird", "Flying"
77
+ lsi.add_item @str7, "Cat", "Bird"
78
+ lsi.add_item @str8, "Bird", "Dog", "Cat"
79
+
80
+ assert_equal ["Dog"], lsi.classify_multiple( @str2 )
81
+ assert_equal ["Cat", "Bird"], lsi.classify_multiple( @str5 )
82
+
83
+ # test with a word unknown alone
84
+ assert_equal "Bird", lsi.classify( "Bird!" )
85
+ assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple( "Bird!" )
86
+ end
87
+
88
+ def test_external_classifying
89
+ lsi = Classifier::LSI.new
90
+ bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']
91
+ lsi.add_item @str1, "Dog" ; bayes.train_dog @str1
92
+ lsi.add_item @str2, "Dog" ; bayes.train_dog @str2
93
+ lsi.add_item @str3, "Cat" ; bayes.train_cat @str3
94
+ lsi.add_item @str4, "Cat" ; bayes.train_cat @str4
95
+ lsi.add_item @str5, "Bird" ; bayes.train_bird @str5
96
+
97
+ # We're talking about dogs. Even though the text matches the corpus on
98
+ # cats better. Dogs have more semantic weight than cats. So bayes
99
+ # will fail here, but the LSI recognizes content.
100
+ tricky_case = "This text revolves around dogs."
101
+ assert_equal "Dog", lsi.classify( tricky_case )
102
+ assert_not_equal "Dog", bayes.classify( tricky_case )
103
+ end
104
+
105
+ def test_recategorize_interface
106
+ lsi = Classifier::LSI.new
107
+ lsi.add_item @str1, "Dog"
108
+ lsi.add_item @str2, "Dog"
109
+ lsi.add_item @str3, "Cat"
110
+ lsi.add_item @str4, "Cat"
111
+ lsi.add_item @str5, "Bird"
112
+
113
+ tricky_case = "This text revolves around dogs."
114
+ assert_equal "Dog", lsi.classify( tricky_case )
115
+
116
+ # Recategorize as needed.
117
+ lsi.categories_for(@str1).clear.push "Cow"
118
+ lsi.categories_for(@str2).clear.push "Cow"
119
+
120
+ assert !lsi.needs_rebuild?
121
+ assert_equal "Cow", lsi.classify( tricky_case )
122
+ end
123
+
124
+ def test_search
125
+ lsi = Classifier::LSI.new
126
+ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
127
+
128
+ # Searching by content and text, note that @str2 comes up first, because
129
+ # both "dog" and "involve" are present. But, the next match is @str1 instead
130
+ # of @str4, because "dog" carries more weight than involves.
131
+ assert_equal( [@str2, @str1, @str4, @str5, @str3],
132
+ lsi.search("dog involves", 100) )
133
+
134
+ # Keyword search shows how the space is mapped out in relation to
135
+ # dog when magnitude is remove. Note the relations. We move from dog
136
+ # through involve and then finally to other words.
137
+ assert_equal( [@str1, @str2, @str4, @str5, @str3],
138
+ lsi.search("dog", 5) )
139
+ end
140
+
141
+ def test_serialize_safe
142
+ lsi = Classifier::LSI.new
143
+ [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
144
+
145
+ lsi_md = Marshal.dump lsi
146
+ lsi_m = Marshal.load lsi_md
147
+
148
+ assert_equal lsi_m.search("cat", 3), lsi.search("cat", 3)
149
+ assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
150
+ end
151
+
152
+ def test_keyword_search
153
+ lsi = Classifier::LSI.new
154
+ lsi.add_item @str1, "Dog"
155
+ lsi.add_item @str2, "Dog"
156
+ lsi.add_item @str3, "Cat"
157
+ lsi.add_item @str4, "Cat"
158
+ lsi.add_item @str5, "Bird"
159
+
160
+ assert_equal ['dog', 'text', 'deal'], lsi.highest_ranked_stems(@str1)
161
+ end
162
+
163
+ def test_summary
164
+ assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
165
+ end
166
+
167
+ end
@@ -0,0 +1,38 @@
1
+ # coding:utf-8
2
+ require File.dirname(__FILE__) + '/test_helper'
3
+ require 'tempfile'
4
+
5
+ class StopWordsTest < Test::Unit::TestCase
6
+ def test_en
7
+ assert_equal 80, Classifier::StopWords.for('en').size
8
+ end
9
+
10
+ def test_ru
11
+ assert_equal 159, Classifier::StopWords.for('ru').size
12
+ end
13
+
14
+ def test_stopword_es
15
+ list = Classifier::StopWords.for('es')
16
+ assert list.include?('más')
17
+ end
18
+
19
+ def test_unknown
20
+ assert_equal [], Classifier::StopWords.for('_unknown_')
21
+ end
22
+
23
+ def setup
24
+ @tmp = nil
25
+ end
26
+ def teardown
27
+ Classifier::StopWords.reset
28
+ File.delete(@tmp) unless @tmp.nil?
29
+ end
30
+
31
+ def test_custom_lang_file
32
+ lang = 'xxyyzz'
33
+ @tmp = File.join(File.dirname(__FILE__), lang)
34
+ File.open(@tmp, 'w') { |f| f.puts "str1\nstr2" }
35
+ assert_equal ["str1", "str2"], Classifier::StopWords.for(lang,
36
+ File.dirname(@tmp))
37
+ end
38
+ end