classifier-reborn 2.0.3 → 2.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.markdown +130 -14
- data/data/stopwords/ca +126 -0
- data/data/stopwords/cs +138 -0
- data/data/stopwords/da +101 -0
- data/data/stopwords/de +604 -0
- data/data/stopwords/en +80 -0
- data/data/stopwords/es +351 -0
- data/data/stopwords/fi +747 -0
- data/data/stopwords/fr +463 -0
- data/data/stopwords/hu +35 -0
- data/data/stopwords/it +430 -0
- data/data/stopwords/nl +48 -0
- data/data/stopwords/no +119 -0
- data/data/stopwords/pl +93 -0
- data/data/stopwords/pt +356 -0
- data/data/stopwords/se +386 -0
- data/data/stopwords/tr +114 -0
- data/lib/classifier-reborn/bayes.rb +86 -16
- data/lib/classifier-reborn/category_namer.rb +3 -1
- data/lib/classifier-reborn/extensions/hasher.rb +25 -100
- data/lib/classifier-reborn/extensions/vector.rb +0 -1
- data/lib/classifier-reborn/lsi.rb +36 -25
- data/lib/classifier-reborn/lsi/cached_content_node.rb +48 -0
- data/lib/classifier-reborn/lsi/content_node.rb +27 -10
- data/lib/classifier-reborn/lsi/summarizer.rb +2 -2
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +37 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 193d6c53d76559337140f192fc69910418015abe
|
4
|
+
data.tar.gz: 745ec353d12ad84aaf74cca6b235401127f00075
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c9891b16c6e9fb2ddfffd32a2335f59bfc55a5e97dae675b652acaa9122a2fc268bfc0d4c2be945456fe46397601127dd2a39782a99767718adf7fee373bdae2
|
7
|
+
data.tar.gz: af844b19d90186a6e3866cdcbfa3deb6ba8492e4646298d6712168c1a329f34fbd24c551ff241a561711a8516cf57577dd7fe1a96bb069190e8545a09a570ff0
|
data/README.markdown
CHANGED
@@ -41,25 +41,140 @@ A Bayesian classifier by Lucas Carlson. Bayesian Classifiers are accurate, fast,
|
|
41
41
|
|
42
42
|
```ruby
|
43
43
|
require 'classifier-reborn'
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
}
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
44
|
+
classifier = ClassifierReborn::Bayes.new 'Interesting', 'Uninteresting'
|
45
|
+
classifier.train_interesting "here are some good words. I hope you love them"
|
46
|
+
classifier.train_uninteresting "here are some bad words, I hate you"
|
47
|
+
classifier.classify "I hate bad words and you" # returns 'Uninteresting'
|
48
|
+
|
49
|
+
classifier_snapshot = Marshal.dump classifier
|
50
|
+
# This is a string of bytes, you can persist it anywhere you like
|
51
|
+
|
52
|
+
File.open("classifier.dat", "w") {|f| f.write(classifier_snapshot) }
|
53
|
+
# Or Redis.current.set "classifier", classifier_snapshot
|
54
|
+
|
55
|
+
# This is now saved to a file, and you can safely restart the application
|
56
|
+
data = File.read("classifier.dat")
|
57
|
+
# Or data = Redis.current.get "classifier"
|
58
|
+
trained_classifier = Marshal.load data
|
59
|
+
trained_classifier.classify "I love" # returns 'Interesting'
|
57
60
|
```
|
58
61
|
|
59
|
-
|
62
|
+
Beyond the basic example, the constructor and trainer can be used in a more
|
63
|
+
flexible way to accommodate non-trivial applications. Consider the following
|
64
|
+
program:
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
#!/usr/bin/env ruby
|
68
|
+
# classifier_reborn_demo.rb
|
69
|
+
|
70
|
+
require 'classifier-reborn'
|
71
|
+
|
72
|
+
training_set = DATA.read.split("\n")
|
73
|
+
categories = training_set.shift.split(',').map{|c| c.strip}
|
74
|
+
|
75
|
+
classifier = ClassifierReborn::Bayes.new categories
|
76
|
+
|
77
|
+
training_set.each do |a_line|
|
78
|
+
next if a_line.empty? || '#' == a_line.strip[0]
|
79
|
+
parts = a_line.strip.split(':')
|
80
|
+
classifier.train(parts.first, parts.last)
|
81
|
+
end
|
82
|
+
|
83
|
+
puts classifier.classify "I hate bad words and you" #=> 'Uninteresting'
|
84
|
+
puts classifier.classify "I hate javascript" #=> 'Uninteresting'
|
85
|
+
puts classifier.classify "javascript is bad" #=> 'Uninteresting'
|
86
|
+
|
87
|
+
puts classifier.classify "all you need is ruby" #=> 'Interesting'
|
88
|
+
puts classifier.classify "i love ruby" #=> 'Interesting'
|
89
|
+
|
90
|
+
puts classifier.classify "which is better dogs or cats" #=> 'dog'
|
91
|
+
puts classifier.classify "what do I need to kill rats and mice" #=> 'cat'
|
92
|
+
|
93
|
+
__END__
|
94
|
+
Interesting, Uninteresting
|
95
|
+
interesting: here are some good words. I hope you love them
|
96
|
+
interesting: all you need is love
|
97
|
+
interesting: the love boat, soon we will be taking another ride
|
98
|
+
interesting: ruby don't take your love to town
|
99
|
+
|
100
|
+
uninteresting: here are some bad words, I hate you
|
101
|
+
uninteresting: bad bad leroy brown badest man in the darn town
|
102
|
+
uninteresting: the good the bad and the ugly
|
103
|
+
uninteresting: java, javascript, css front-end html
|
104
|
+
#
|
105
|
+
# train categories that were not pre-described
|
106
|
+
#
|
107
|
+
dog: dog days of summer
|
108
|
+
dog: a man's best friend is his dog
|
109
|
+
dog: a good hunting dog is a fine thing
|
110
|
+
dog: man my dogs are tired
|
111
|
+
dog: dogs are better than cats in soooo many ways
|
112
|
+
|
113
|
+
cat: the fuzz ball spilt the milk
|
114
|
+
cat: got rats or mice get a cat to kill them
|
115
|
+
cat: cats never come when you call them
|
116
|
+
cat: That dang cat keeps scratching the furniture
|
117
|
+
```
|
118
|
+
|
119
|
+
#### Knowing the Score
|
120
|
+
|
121
|
+
When you ask a Bayesian classifier to classify text against a set of trained categories, it does so by generating a score (as a Float) for each possible category. The higher the score, the closer the fit your text has with that category. The category with the highest score is returned as the best matching category.
|
122
|
+
|
123
|
+
In *ClassifierReborn* the methods *classifications* and *classify_with_score* give you access to the calculated scores. The method *classify* only returns the best matching category.
|
124
|
+
|
125
|
+
Knowing the score allows you to do some interesting things. For example if your application is to generate tags for a blog post you could use the *classifications* method to get a hash of the categories and their scores. You would sort on score and take only the top 3 or 4 categories as your tags for the blog post.
|
126
|
+
|
127
|
+
You could within your application establish the smallest acceptable score and only use those categories whose score is greater than or equal to your smallest acceptable score as your tags for the blog post.
|
128
|
+
|
129
|
+
But what if you only use the *classify* method? It does not show you the score of the best category. How do you know that the best category is really any good?
|
130
|
+
|
131
|
+
You can use the threshold.
|
132
|
+
|
133
|
+
#### Using the Threshold
|
134
|
+
|
135
|
+
Some applications can have only one category. The application wants to know if the text being classified is of that category or not. For example, consider a list of normal free text responses to some question, or maybe a URL string coming to your web application. You know what a normal response looks like, but you have no idea how people might misuse the response. So what you want to do is create a Bayesian classifier that just has one category, for example 'Good', and you want to know whether your text is classified as Good or Not Good.
|
136
|
+
|
137
|
+
Or suppose you just want the ability to have multiple categories and a 'None of the Above' as a possibility.
|
138
|
+
|
139
|
+
##### Threshold
|
140
|
+
|
141
|
+
When you initialize the *ClassifierReborn::Bayes* classifier there are several options which can be set that control threshold processing.
|
142
|
+
|
143
|
+
```ruby
|
144
|
+
b = ClassifierReborn::Bayes.new(
|
145
|
+
'good', # one or more categories
|
146
|
+
enable_threshold: true, # default: false
|
147
|
+
threshold: -10.0 # default: 0.0
|
148
|
+
)
|
149
|
+
b.train_good 'good stuff from Dobie Gillis'
|
150
|
+
# ...
|
151
|
+
text = 'bad junk from Maynard G. Krebs'
|
152
|
+
result = b.classify text
|
153
|
+
if result.nil?
|
154
|
+
STDERR.puts "ALERT: This is not good: #{text}"
|
155
|
+
let_loose_the_dogs_of_war! # method definition left to the reader
|
156
|
+
end
|
157
|
+
|
158
|
+
```
|
159
|
+
|
160
|
+
In the *classify* method, when the best category for the text has a score that is either less than the established threshold or is Float::INFINITY, a nil category is returned. When you see a nil value returned from the *classify* method, it means that none of the trained categories (regardless of how many categories were trained) has a score that is above or equal to the established threshold.
|
161
|
+
|
162
|
+
#### Other Threshold-related Convenience Methods
|
163
|
+
|
164
|
+
```ruby
|
165
|
+
b.threshold # get the current threshold
|
166
|
+
b.threshold = -10.0 # set the threshold
|
167
|
+
b.threshold_enabled? # Boolean: is the threshold enabled?
|
168
|
+
b.threshold_disabled? # Boolean: is the threshold disabled?
|
169
|
+
b.enable_threshold # enables threshold processing
|
170
|
+
b.disable_threshold # disables threshold processing
|
171
|
+
```
|
172
|
+
|
173
|
+
Using these convenience methods, your applications can dynamically adjust threshold processing as required.
|
60
174
|
|
61
175
|
### Bayesian Classification
|
62
176
|
|
177
|
+
* https://en.wikipedia.org/wiki/Naive_Bayes_classifier
|
63
178
|
* http://www.process.com/precisemail/bayesian_filtering.htm
|
64
179
|
* http://en.wikipedia.org/wiki/Bayesian_filtering
|
65
180
|
* http://www.paulgraham.com/spam.html
|
@@ -110,5 +225,6 @@ with more than just simple strings.
|
|
110
225
|
* Cameron McBride (cameron.mcbride@gmail.com)
|
111
226
|
* Ivan Acosta-Rubio (ivan@softwarecriollo.com)
|
112
227
|
* Parker Moore (email@byparker.com)
|
228
|
+
* Chase Gilliam (chase.gilliam@gmail.com)
|
113
229
|
|
114
230
|
This library is released under the terms of the GNU LGPL. See LICENSE for more details.
|
data/data/stopwords/ca
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
de
|
2
|
+
es
|
3
|
+
i
|
4
|
+
a
|
5
|
+
o
|
6
|
+
un
|
7
|
+
una
|
8
|
+
unes
|
9
|
+
uns
|
10
|
+
un
|
11
|
+
tot
|
12
|
+
també
|
13
|
+
altre
|
14
|
+
algun
|
15
|
+
alguna
|
16
|
+
alguns
|
17
|
+
algunes
|
18
|
+
ser
|
19
|
+
és
|
20
|
+
soc
|
21
|
+
ets
|
22
|
+
som
|
23
|
+
estic
|
24
|
+
està
|
25
|
+
estem
|
26
|
+
esteu
|
27
|
+
estan
|
28
|
+
com
|
29
|
+
en
|
30
|
+
per
|
31
|
+
perquè
|
32
|
+
per que
|
33
|
+
estat
|
34
|
+
estava
|
35
|
+
ans
|
36
|
+
abans
|
37
|
+
éssent
|
38
|
+
ambdós
|
39
|
+
però
|
40
|
+
per
|
41
|
+
poder
|
42
|
+
potser
|
43
|
+
puc
|
44
|
+
podem
|
45
|
+
podeu
|
46
|
+
poden
|
47
|
+
vaig
|
48
|
+
va
|
49
|
+
van
|
50
|
+
fer
|
51
|
+
faig
|
52
|
+
fa
|
53
|
+
fem
|
54
|
+
feu
|
55
|
+
fan
|
56
|
+
cada
|
57
|
+
fi
|
58
|
+
inclòs
|
59
|
+
primer
|
60
|
+
des de
|
61
|
+
conseguir
|
62
|
+
consegueixo
|
63
|
+
consigueix
|
64
|
+
consigueixes
|
65
|
+
conseguim
|
66
|
+
consigueixen
|
67
|
+
anar
|
68
|
+
haver
|
69
|
+
tenir
|
70
|
+
tinc
|
71
|
+
te
|
72
|
+
tenim
|
73
|
+
teniu
|
74
|
+
tene
|
75
|
+
el
|
76
|
+
la
|
77
|
+
les
|
78
|
+
els
|
79
|
+
seu
|
80
|
+
aquí
|
81
|
+
meu
|
82
|
+
teu
|
83
|
+
ells
|
84
|
+
elles
|
85
|
+
ens
|
86
|
+
nosaltres
|
87
|
+
vosaltres
|
88
|
+
si
|
89
|
+
dins
|
90
|
+
sols
|
91
|
+
solament
|
92
|
+
saber
|
93
|
+
saps
|
94
|
+
sap
|
95
|
+
sabem
|
96
|
+
sabeu
|
97
|
+
saben
|
98
|
+
últim
|
99
|
+
llarg
|
100
|
+
bastant
|
101
|
+
fas
|
102
|
+
molts
|
103
|
+
aquells
|
104
|
+
aquelles
|
105
|
+
seus
|
106
|
+
llavors
|
107
|
+
sota
|
108
|
+
dalt
|
109
|
+
ús
|
110
|
+
molt
|
111
|
+
era
|
112
|
+
eres
|
113
|
+
erem
|
114
|
+
eren
|
115
|
+
mode
|
116
|
+
bé
|
117
|
+
quant
|
118
|
+
quan
|
119
|
+
on
|
120
|
+
mentre
|
121
|
+
qui
|
122
|
+
amb
|
123
|
+
entre
|
124
|
+
sense
|
125
|
+
jo
|
126
|
+
aquell
|
data/data/stopwords/cs
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
dnes
|
2
|
+
cz
|
3
|
+
timto
|
4
|
+
budes
|
5
|
+
budem
|
6
|
+
byli
|
7
|
+
jses
|
8
|
+
muj
|
9
|
+
svym
|
10
|
+
ta
|
11
|
+
tomto
|
12
|
+
tohle
|
13
|
+
tuto
|
14
|
+
tyto
|
15
|
+
jej
|
16
|
+
zda
|
17
|
+
proc
|
18
|
+
mate
|
19
|
+
tato
|
20
|
+
kam
|
21
|
+
tohoto
|
22
|
+
kdo
|
23
|
+
kteri
|
24
|
+
mi
|
25
|
+
nam
|
26
|
+
tom
|
27
|
+
tomuto
|
28
|
+
mit
|
29
|
+
nic
|
30
|
+
proto
|
31
|
+
kterou
|
32
|
+
byla
|
33
|
+
toho
|
34
|
+
protoze
|
35
|
+
asi
|
36
|
+
ho
|
37
|
+
nasi
|
38
|
+
napiste
|
39
|
+
re
|
40
|
+
coz
|
41
|
+
tim
|
42
|
+
takze
|
43
|
+
svych
|
44
|
+
jeji
|
45
|
+
svymi
|
46
|
+
jste
|
47
|
+
aj
|
48
|
+
tu
|
49
|
+
tedy
|
50
|
+
teto
|
51
|
+
bylo
|
52
|
+
kde
|
53
|
+
ke
|
54
|
+
prave
|
55
|
+
ji
|
56
|
+
nad
|
57
|
+
nejsou
|
58
|
+
ci
|
59
|
+
pod
|
60
|
+
tema
|
61
|
+
mezi
|
62
|
+
pres
|
63
|
+
ty
|
64
|
+
pak
|
65
|
+
vam
|
66
|
+
ani
|
67
|
+
kdyz
|
68
|
+
vsak
|
69
|
+
ne
|
70
|
+
jsem
|
71
|
+
tento
|
72
|
+
clanku
|
73
|
+
clanky
|
74
|
+
aby
|
75
|
+
jsme
|
76
|
+
pred
|
77
|
+
pta
|
78
|
+
jejich
|
79
|
+
byl
|
80
|
+
jeste
|
81
|
+
az
|
82
|
+
bez
|
83
|
+
take
|
84
|
+
pouze
|
85
|
+
prvni
|
86
|
+
vase
|
87
|
+
ktera
|
88
|
+
nas
|
89
|
+
novy
|
90
|
+
tipy
|
91
|
+
pokud
|
92
|
+
muze
|
93
|
+
design
|
94
|
+
strana
|
95
|
+
jeho
|
96
|
+
sve
|
97
|
+
jine
|
98
|
+
zpravy
|
99
|
+
nove
|
100
|
+
neni
|
101
|
+
vas
|
102
|
+
jen
|
103
|
+
podle
|
104
|
+
zde
|
105
|
+
clanek
|
106
|
+
uz
|
107
|
+
email
|
108
|
+
byt
|
109
|
+
vice
|
110
|
+
bude
|
111
|
+
jiz
|
112
|
+
nez
|
113
|
+
ktery
|
114
|
+
by
|
115
|
+
ktere
|
116
|
+
co
|
117
|
+
nebo
|
118
|
+
ten
|
119
|
+
tak
|
120
|
+
ma
|
121
|
+
pri
|
122
|
+
od
|
123
|
+
po
|
124
|
+
jsou
|
125
|
+
jak
|
126
|
+
dalsi
|
127
|
+
ale
|
128
|
+
si
|
129
|
+
ve
|
130
|
+
to
|
131
|
+
jako
|
132
|
+
za
|
133
|
+
zpet
|
134
|
+
ze
|
135
|
+
do
|
136
|
+
pro
|
137
|
+
je
|
138
|
+
na
|
data/data/stopwords/da
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
af
|
2
|
+
alle
|
3
|
+
andet
|
4
|
+
andre
|
5
|
+
at
|
6
|
+
begge
|
7
|
+
da
|
8
|
+
de
|
9
|
+
den
|
10
|
+
denne
|
11
|
+
der
|
12
|
+
deres
|
13
|
+
det
|
14
|
+
dette
|
15
|
+
dig
|
16
|
+
din
|
17
|
+
dog
|
18
|
+
du
|
19
|
+
ej
|
20
|
+
eller
|
21
|
+
en
|
22
|
+
end
|
23
|
+
ene
|
24
|
+
eneste
|
25
|
+
enhver
|
26
|
+
et
|
27
|
+
fem
|
28
|
+
fire
|
29
|
+
flere
|
30
|
+
fleste
|
31
|
+
for
|
32
|
+
fordi
|
33
|
+
forrige
|
34
|
+
fra
|
35
|
+
få
|
36
|
+
før
|
37
|
+
god
|
38
|
+
han
|
39
|
+
hans
|
40
|
+
har
|
41
|
+
hendes
|
42
|
+
her
|
43
|
+
hun
|
44
|
+
hvad
|
45
|
+
hvem
|
46
|
+
hver
|
47
|
+
hvilken
|
48
|
+
hvis
|
49
|
+
hvor
|
50
|
+
hvordan
|
51
|
+
hvorfor
|
52
|
+
hvornår
|
53
|
+
i
|
54
|
+
ikke
|
55
|
+
ind
|
56
|
+
ingen
|
57
|
+
intet
|
58
|
+
jeg
|
59
|
+
jeres
|
60
|
+
kan
|
61
|
+
kom
|
62
|
+
kommer
|
63
|
+
lav
|
64
|
+
lidt
|
65
|
+
lille
|
66
|
+
man
|
67
|
+
mand
|
68
|
+
mange
|
69
|
+
med
|
70
|
+
meget
|
71
|
+
men
|
72
|
+
mens
|
73
|
+
mere
|
74
|
+
mig
|
75
|
+
ned
|
76
|
+
ni
|
77
|
+
nogen
|
78
|
+
noget
|
79
|
+
ny
|
80
|
+
nyt
|
81
|
+
nær
|
82
|
+
næste
|
83
|
+
næsten
|
84
|
+
og
|
85
|
+
op
|
86
|
+
otte
|
87
|
+
over
|
88
|
+
på
|
89
|
+
se
|
90
|
+
seks
|
91
|
+
ses
|
92
|
+
som
|
93
|
+
stor
|
94
|
+
store
|
95
|
+
syv
|
96
|
+
ti
|
97
|
+
til
|
98
|
+
to
|
99
|
+
tre
|
100
|
+
ud
|
101
|
+
var
|