logankoester-classifier 1.4.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,161 @@
1
+ # Russian stopwords
2
+ # http://snowball.tartarus.org/algorithms/russian/stop.txt
3
+ и # and
4
+ в # in/into
5
+ во # alternative form
6
+ не # not
7
+ что # what/that
8
+ он # he
9
+ на # on/onto
10
+ я # i
11
+ с # from
12
+ со # alternative form
13
+ как # how
14
+ а # milder form of `no' (but)
15
+ то # conjunction and form of `that'
16
+ все # all
17
+ она # she
18
+ так # so, thus
19
+ его # him
20
+ но # but
21
+ да # yes/and
22
+ ты # thou
23
+ к # towards, by
24
+ у # around, chez
25
+ же # intensifier particle
26
+ вы # you
27
+ за # beyond, behind
28
+ бы # conditional/subj. particle
29
+ по # up to, along
30
+ только # only
31
+ ее # her
32
+ мне # to me
33
+ было # it was
34
+ вот # here is/are, particle
35
+ от # away from
36
+ меня # me
37
+ еще # still, yet, more
38
+ нет # no, there isn't/aren't
39
+ о # about
40
+ из # out of
41
+ ему # to him
42
+ теперь # now
43
+ когда # when
44
+ даже # even
45
+ ну # so, well
46
+ вдруг # suddenly
47
+ ли # interrogative particle
48
+ если # if
49
+ уже # already, but homonym of `narrower'
50
+ или # or
51
+ ни # neither
52
+ быть # to be
53
+ был # he was
54
+ него # prepositional form of его
55
+ до # up to
56
+ вас # you accusative
57
+ нибудь # indef. suffix preceded by hyphen
58
+ опять # again
59
+ уж # already, but homonym of `adder'
60
+ вам # to you
61
+ сказал # he said
62
+ ведь # particle `after all'
63
+ там # there
64
+ потом # then
65
+ себя # oneself
66
+ ничего # nothing
67
+ ей # to her
68
+ может # usually with `быть' as `maybe'
69
+ они # they
70
+ тут # here
71
+ где # where
72
+ есть # there is/are
73
+ надо # got to, must
74
+ ней # prepositional form of ей
75
+ для # for
76
+ мы # we
77
+ тебя # thee
78
+ их # them, their
79
+ чем # than
80
+ была # she was
81
+ сам # self
82
+ чтоб # in order to
83
+ без # without
84
+ будто # as if
85
+ человек # man, person, one
86
+ чего # genitive form of `what'
87
+ раз # once
88
+ тоже # also
89
+ себе # to oneself
90
+ под # beneath
91
+ жизнь # life
92
+ будет # will be
93
+ ж # short form of intensifier particle `же'
94
+ тогда # then
95
+ кто # who
96
+ этот # this
97
+ говорил # was saying
98
+ того # genitive form of `that'
99
+ потому # for that reason
100
+ этого # genitive form of `this'
101
+ какой # which
102
+ совсем # altogether
103
+ ним # prepositional form of `его', `они'
104
+ здесь # here
105
+ этом # prepositional form of `этот'
106
+ один # one
107
+ почти # almost
108
+ мой # my
109
+ тем # instrumental/dative plural of `тот', `то'
110
+ чтобы # full form of `in order that'
111
+ нее # her (acc.)
112
+ кажется # it seems
113
+ сейчас # now
114
+ были # they were
115
+ куда # where to
116
+ зачем # why
117
+ сказать # to say
118
+ всех # all (acc., gen. preposn. plural)
119
+ никогда # never
120
+ сегодня # today
121
+ можно # possible, one can
122
+ при # by
123
+ наконец # finally
124
+ два # two
125
+ об # alternative form of `о', about
126
+ другой # another
127
+ хоть # even
128
+ после # after
129
+ над # above
130
+ больше # more
131
+ тот # that one (masc.)
132
+ через # across, in
133
+ эти # these
134
+ нас # us
135
+ про # about
136
+ всего # in all, only, of all
137
+ них # prepositional form of `они' (they)
138
+ какая # which, feminine
139
+ много # lots
140
+ разве # interrogative particle
141
+ сказала # she said
142
+ три # three
143
+ эту # this, acc. fem. sing.
144
+ моя # my, feminine
145
+ впрочем # moreover, besides
146
+ хорошо # good
147
+ свою # one's own, acc. fem. sing.
148
+ этой # oblique form of `эта', fem. `this'
149
+ перед # in front of
150
+ иногда # sometimes
151
+ лучше # better
152
+ чуть # a little
153
+ том # preposn. form of `that one'
154
+ нельзя # one must not
155
+ такой # such a one
156
+ им # to them
157
+ более # more
158
+ всегда # always
159
+ конечно # of course
160
+ всю # acc. fem. sing of `all'
161
+ между # between
data/lib/init.rb ADDED
@@ -0,0 +1 @@
1
+ require 'classifier'
data/tasks/test.rake ADDED
@@ -0,0 +1,6 @@
1
+ require 'rake/testtask'
2
+
3
+
4
# Registers a Rake test task (under Rake's default task name) and points it
# at every *_test.rb file found anywhere below the test/ directory.
Rake::TestTask.new { |task| task.test_files = FileList['test/**/*_test.rb'] }
data/test/base_test.rb ADDED
@@ -0,0 +1,17 @@
1
+ require File.dirname(__FILE__) + '/test_helper'
2
# Exercises the word-hashing helpers of Classifier::Base against one fixed
# sample sentence.
class HelpersTest < Test::Unit::TestCase
  # word_hash: the expected result contains the punctuation tokens
  # ("!", "'", ".") alongside the word entries.
  def test_word_hash
    expected = {
      'good' => 1, "!" => 1, 'hope' => 1, "'" => 1, "." => 1,
      'love' => 1, 'word' => 1, 'them' => 1, 'test' => 1
    }
    actual = Classifier::Base.new.word_hash("here are some good words of test's. I hope you love them!")
    assert_equal expected, actual
  end

  # clean_word_hash: same sentence, but the expected result holds only the
  # word entries, with no punctuation tokens.
  def test_clean_word_hash
    expected = {
      'good' => 1, 'word' => 1, 'hope' => 1,
      'love' => 1, 'them' => 1, 'test' => 1
    }
    actual = Classifier::Base.new.clean_word_hash("here are some good words of test's. I hope you love them!")
    assert_equal expected, actual
  end
end
@@ -0,0 +1,68 @@
1
+ # coding:utf-8
2
+
3
+ require File.dirname(__FILE__) + '/../test_helper'
4
+
5
# Tests for Classifier::Bayes: dynamic train_*/untrain_* methods, category
# management, classification (English and Russian) and Marshal round-trips.
class BayesianTest < Test::Unit::TestCase
  # Every test starts from a fresh two-category classifier.
  def setup
    @classifier = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting']
  end

  # Training an existing category through the dynamic method must not raise.
  def test_good_training
    assert_nothing_raised { @classifier.train_interesting "love" }
  end

  # Training a category that was never declared must raise.
  def test_bad_training
    assert_raise(StandardError) { @classifier.train_no_category "words" }
  end

  # Methods outside the train_*/untrain_* family must still be NoMethodError.
  def test_bad_method
    assert_raise(NoMethodError) { @classifier.forget_everything_you_know "" }
  end

  def test_categories
    assert_equal ['Interesting', 'Uninteresting'].sort, @classifier.categories.sort
  end

  def test_add_category
    @classifier.add_category 'Test'
    assert_equal ['Test', 'Interesting', 'Uninteresting'].sort, @classifier.categories.sort
  end

  def test_classification
    @classifier.train_interesting "here are some good words. I hope you love them"
    @classifier.train_uninteresting "here are some bad words, I hate you"
    assert_equal 'Uninteresting', @classifier.classify("I hate bad words and you")
  end

  # Regression: if a word of the untraining text is not present in the
  # category, a "TypeError: nil can't be coerced into Fixnum" used to be
  # raised. The explicit assertion documents the intent and makes the test
  # count as a real check instead of an assertion-less method.
  def test_regression_untrain_nil_fixnum
    assert_nothing_raised { @classifier.untrain_interesting "nothing" }
  end

  # Same scenario as test_classification, with the Russian language option.
  def test_ru_classification
    c = Classifier::Bayes.new :categories => ['Interesting', 'Uninteresting'], :language => "ru"
    c.train_interesting "вот несколько хороших слов. Я надеюсь вам они понравились"
    c.train_uninteresting "вот несколько плохих слов. Я тебя ненавижу"
    assert_equal 'Uninteresting', c.classify("Я ненавижу плохие слова и тебя")
  end

  # Classification scores must not depend on the letter case of the input,
  # including for non-ASCII (Cyrillic) text.
  def test_case_insensitive
    c = Classifier::Bayes.new :categories => [:good, :bad], :language => "ru"
    c.train_good "Хорошо"
    c.train_bad "Плохо"

    assert_equal c.classifications("ХОРОШО"), c.classifications("хорошо")
    assert_equal c.classifications("плОХО"), c.classifications("плохо")
  end

  # A classifier restored from its own Marshal dump must classify identically.
  def test_serialize
    txt = "this can be serialized"
    @classifier.train_interesting(txt)
    @classifier.train_uninteresting("really uninteresting")

    restored = Marshal.load(Marshal.dump(@classifier))
    assert_equal @classifier.classify(txt), restored.classify(txt)
  end
end
@@ -0,0 +1,167 @@
1
+ require File.dirname(__FILE__) + '/../test_helper'
2
# Tests for Classifier::LSI: indexing, (multi-)classification, search,
# recategorization, serialization, keyword ranking and String#summary.
class LSITest < Test::Unit::TestCase
  def setup
    # We repeat principal words to help weight them.
    # This test is rather delicate, since this system is mostly noise.
    @str1 = "This text deals with dogs. Dogs."
    @str2 = "This text involves dogs too. Dogs! "
    @str3 = "This text revolves around cats. Cats."
    @str4 = "This text also involves cats. Cats!"
    @str5 = "This text involves birds. Birds."
    @str6 = "Is it about dogs or birds?"
    @str7 = "Is it about birds or cats?"
    @str8 = "I would prefer a bird over thousand cats or dogs because birds are smaller."
  end

  def test_basic_indexing
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
    assert !lsi.needs_rebuild?

    # Note that the closest match to str1 is str2, even though it is not
    # the closest text match.
    assert_equal [@str2, @str5, @str3], lsi.find_related(@str1, 3)
  end

  # With :auto_rebuild disabled the index stays stale until build_index runs.
  def test_not_auto_rebuild
    lsi = Classifier::LSI.new :auto_rebuild => false
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    assert lsi.needs_rebuild?
    lsi.build_index
    assert !lsi.needs_rebuild?
  end

  # A single indexed item is not enough to classify anything.
  def test_basic_categorizing_with_too_small_dataset
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"

    # assert_nil is the dedicated assertion for a nil result and gives a
    # clearer failure message than comparing against nil with assert_equal.
    assert_nil lsi.classify(@str1)
    assert_equal [], lsi.classify_multiple(@str3)
  end

  def test_basic_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal "Dog", lsi.classify(@str1)
    assert_equal "Cat", lsi.classify(@str3)
    assert_equal "Bird", lsi.classify(@str5)
    assert_equal "Dog", lsi.classify(@str6)
    assert_equal "Bird", lsi.classify(@str7)
    assert_equal "Bird", lsi.classify(@str8)
  end

  def test_multiple_categorizing
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal ["Dog", "Bird"], lsi.classify_multiple(@str6)
    assert_equal ["Cat", "Bird"], lsi.classify_multiple(@str7)
    assert_equal ["Bird"], lsi.classify_multiple(@str8)
  end

  # Items carrying several categories each; single-topic texts are classified.
  def test_multiple_categorizing_reverse
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str6, "Dog", "Bird", "Flying"
    lsi.add_item @str7, "Cat", "Bird"
    lsi.add_item @str8, "Bird", "Dog", "Cat"

    assert_equal ["Dog"], lsi.classify_multiple(@str2)
    assert_equal ["Cat", "Bird"], lsi.classify_multiple(@str5)

    # Test with a word unknown alone.
    assert_equal "Bird", lsi.classify("Bird!")
    assert_equal ["Bird", "Dog", "Cat"], lsi.classify_multiple("Bird!")
  end

  def test_external_classifying
    lsi = Classifier::LSI.new
    bayes = Classifier::Bayes.new :categories => ['Dog', 'Cat', 'Bird']

    lsi.add_item @str1, "Dog"
    bayes.train_dog @str1
    lsi.add_item @str2, "Dog"
    bayes.train_dog @str2
    lsi.add_item @str3, "Cat"
    bayes.train_cat @str3
    lsi.add_item @str4, "Cat"
    bayes.train_cat @str4
    lsi.add_item @str5, "Bird"
    bayes.train_bird @str5

    # We're talking about dogs. Even though the text matches the corpus on
    # cats better. Dogs have more semantic weight than cats. So bayes
    # will fail here, but the LSI recognizes content.
    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify(tricky_case)
    assert_not_equal "Dog", bayes.classify(tricky_case)
  end

  def test_recategorize_interface
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    tricky_case = "This text revolves around dogs."
    assert_equal "Dog", lsi.classify(tricky_case)

    # Recategorize as needed.
    lsi.categories_for(@str1).clear.push "Cow"
    lsi.categories_for(@str2).clear.push "Cow"

    # Changing categories in place must not invalidate the built index.
    assert !lsi.needs_rebuild?
    assert_equal "Cow", lsi.classify(tricky_case)
  end

  def test_search
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    # Searching by content and text, note that @str2 comes up first, because
    # both "dog" and "involve" are present. But, the next match is @str1 instead
    # of @str4, because "dog" carries more weight than involves.
    assert_equal [@str2, @str1, @str4, @str5, @str3],
                 lsi.search("dog involves", 100)

    # Keyword search shows how the space is mapped out in relation to
    # dog when magnitude is removed. Note the relations. We move from dog
    # through involve and then finally to other words.
    assert_equal [@str1, @str2, @str4, @str5, @str3],
                 lsi.search("dog", 5)
  end

  # An index restored from its own Marshal dump must answer like the original.
  def test_serialize_safe
    lsi = Classifier::LSI.new
    [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }

    restored = Marshal.load(Marshal.dump(lsi))

    # The original index supplies the expected value, the restored copy the
    # actual one, so failure messages read the right way round.
    assert_equal lsi.search("cat", 3), restored.search("cat", 3)
    assert_equal lsi.find_related(@str1, 3), restored.find_related(@str1, 3)
  end

  def test_keyword_search
    lsi = Classifier::LSI.new
    lsi.add_item @str1, "Dog"
    lsi.add_item @str2, "Dog"
    lsi.add_item @str3, "Cat"
    lsi.add_item @str4, "Cat"
    lsi.add_item @str5, "Bird"

    assert_equal ['dog', 'text', 'deal'], lsi.highest_ranked_stems(@str1)
  end

  # String#summary is called on the five sample texts joined together.
  def test_summary
    joined = [@str1, @str2, @str3, @str4, @str5].join
    assert_equal "This text involves dogs too [...] This text also involves cats", joined.summary(2)
  end
end
@@ -0,0 +1,38 @@
1
+ # coding:utf-8
2
+ require File.dirname(__FILE__) + '/test_helper'
3
+ require 'tempfile'
4
+
5
# Tests for Classifier::StopWords.for: bundled language lists, unknown
# languages, and loading a list from a caller-supplied directory.
class StopWordsTest < Test::Unit::TestCase
  def setup
    # Path of the scratch language file; only set by tests that create one.
    @tmp = nil
  end

  def teardown
    # NOTE(review): presumably drops any lists cached by earlier tests --
    # confirm against Classifier::StopWords.
    Classifier::StopWords.reset
    # Only delete the scratch file if it was actually created; otherwise a
    # test that failed before writing it would raise Errno::ENOENT here and
    # hide the real failure.
    File.delete(@tmp) if @tmp && File.exist?(@tmp)
  end

  def test_en
    assert_equal 80, Classifier::StopWords.for('en').size
  end

  def test_ru
    assert_equal 159, Classifier::StopWords.for('ru').size
  end

  # Non-ASCII entries must be present in the loaded list.
  def test_stopword_es
    list = Classifier::StopWords.for('es')
    assert list.include?('más')
  end

  # An unknown language yields an empty list rather than an error.
  def test_unknown
    assert_equal [], Classifier::StopWords.for('_unknown_')
  end

  # A file named after the language, placed in the directory passed as the
  # second argument, is read as that language's list.
  def test_custom_lang_file
    lang = 'xxyyzz'
    @tmp = File.join(File.dirname(__FILE__), lang)
    File.open(@tmp, 'w') { |f| f.puts "str1\nstr2" }
    assert_equal ["str1", "str2"],
                 Classifier::StopWords.for(lang, File.dirname(@tmp))
  end
end