fast_bayes 1.0.1 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +10 -2
- data/README.md +16 -3
- data/benchmarks/bench.rb +46 -0
- data/bin/setup +1 -2
- data/ext/Makefile +1 -1
- data/ext/fast_bayes/Makefile +6 -6
- data/ext/fast_bayes/bayes.h +35 -24
- data/ext/fast_bayes/extconf.rb +2 -0
- data/ext/fast_bayes/stopwords.h +23 -0
- data/ext/fast_bayes/stopwords/en +174 -0
- data/ext/test/classification_test.cpp +5 -5
- data/ext/test/data/r8-test-all-terms.txt +2189 -0
- data/ext/test/data/r8-train-all-terms.txt +5485 -0
- data/fast_bayes.gemspec +1 -0
- data/lib/fast_bayes.rb +20 -2
- data/lib/fast_bayes/version.rb +1 -1
- metadata +27 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc4b2951cd54f309119ba86832defb867967acd4
|
4
|
+
data.tar.gz: 9ac9c0aa6d4925229a3f2255171358c8d2146c37
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bc0e2b924f9c6d866b248e271923d73dc1232ba386505bfa8ee9af589d36c6da37afce753d9c6b464d7ade32edfab3a832aa7349d89a429720311517ab745a23
|
7
|
+
data.tar.gz: ac85cb50805f784e8b78c5dcfcfa2fd91af017ef7569bf12b094971e67b2d9d780e8e4e173ddd666afd6a70d7cc87db0419ef4e3c3087ceffc43061e3600f9f0
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -1,9 +1,11 @@
|
|
1
|
-
# FastBayes
|
1
|
+
# FastBayes [![Build Status](https://travis-ci.org/Coolnesss/fast-bayes.svg?branch=master)](https://travis-ci.org/Coolnesss/fast-bayes)
|
2
2
|
|
3
3
|
A fast implementation of the naive Bayes classification algorithm. Written in C++ with an interface for Ruby using [Rice](https://github.com/jasonroelofs/rice).
|
4
4
|
|
5
5
|
Performs text classification with no separate training step: the cost of training is split between classification and observation. This is especially useful when the data arrives as an online stream, because the system can improve gradually.
|
6
6
|
|
7
|
+
FastBayes supports any number of classes, and they don't need to be added in advance.
|
8
|
+
|
7
9
|
## Installation
|
8
10
|
|
9
11
|
Add this line to your application's Gemfile:
|
@@ -22,7 +24,18 @@ Or install it yourself as:
|
|
22
24
|
|
23
25
|
## Usage
|
24
26
|
|
25
|
-
|
27
|
+
```ruby
|
28
|
+
[1] pry(main)> require 'fast_bayes'
|
29
|
+
=> true
|
30
|
+
[2] pry(main)> b = FastBayes.new
|
31
|
+
=> #<FastBayes:0x00000002cb5d98>
|
32
|
+
[3] pry(main)> b.observe "This sentence is good", "Good"
|
33
|
+
=> nil
|
34
|
+
[4] pry(main)> b.observe "This sentence is bad", "Bad"
|
35
|
+
=> nil
|
36
|
+
[5] pry(main)> b.classify "good stuff"
|
37
|
+
=> "Good"
|
38
|
+
```
|
26
39
|
|
27
40
|
## Development
|
28
41
|
|
@@ -32,7 +45,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
32
45
|
|
33
46
|
## Contributing
|
34
47
|
|
35
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
48
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/Coolnesss/fast-bayes.
|
36
49
|
|
37
50
|
|
38
51
|
## License
|
data/benchmarks/bench.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require 'benchmark'
|
2
|
+
require '/home/chang/fast_bayes/lib/fast_bayes/fast_bayes'
|
3
|
+
require 'classifier-reborn' # Install from rubygems
|
4
|
+
|
5
|
+
|
6
|
+
# Read newspaper data
|
7
|
+
# Data from http://ana.cachopo.org/datasets-for-single-label-text-categorization
|
8
|
+
d = IO.read("ext/test/data/r8-train-all-terms.txt")
|
9
|
+
training = d.split("\n").map{|x| x.split("\t")}
|
10
|
+
|
11
|
+
d = IO.read("ext/test/data/r8-test-all-terms.txt")
|
12
|
+
test = d.split("\n").map{|x| x.split("\t")}
|
13
|
+
test = test.shuffle
|
14
|
+
|
15
|
+
n = test.size
|
16
|
+
# FastBayes
|
17
|
+
puts Benchmark.measure {
|
18
|
+
b = FastBayes.new
|
19
|
+
# Train all the training samples
|
20
|
+
training.each do |t|
|
21
|
+
b.observe(t[1], t.first)
|
22
|
+
end
|
23
|
+
|
24
|
+
errors = 0
|
25
|
+
|
26
|
+
# Classify random test data
|
27
|
+
test.take(n).each do |t|
|
28
|
+
errors = errors + 1 if b.classify(t[1]) != t.first
|
29
|
+
end
|
30
|
+
puts "Error rate #{(errors / (n*1.0))}"
|
31
|
+
}
|
32
|
+
|
33
|
+
# ClassifierReborn
|
34
|
+
puts Benchmark.measure {
|
35
|
+
classifier = ClassifierReborn::Bayes.new auto_categorize: true
|
36
|
+
training.each do |t|
|
37
|
+
classifier.train t.first, t[1]
|
38
|
+
end
|
39
|
+
|
40
|
+
errors = 0
|
41
|
+
|
42
|
+
test.take(n).each do |t|
|
43
|
+
errors = errors + 1 if classifier.classify(t[1]).downcase != t.first
|
44
|
+
end
|
45
|
+
puts "Error rate #{(errors / (n*1.0))}"
|
46
|
+
}
|
data/bin/setup
CHANGED
data/ext/Makefile
CHANGED
data/ext/fast_bayes/Makefile
CHANGED
@@ -129,7 +129,7 @@ CFLAGS = $(CCDLFLAGS) $(cflags) -fPIC $(ARCH_FLAG)
|
|
129
129
|
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
130
130
|
DEFS =
|
131
131
|
CPPFLAGS = -I/home/chang/.rbenv/versions/2.3.1/include $(DEFS) $(cppflags) -I/home/chang/.rbenv/versions/2.3.1/lib/ruby/gems/2.3.0/gems/rice-2.1.0/ruby/lib/include
|
132
|
-
CXXFLAGS = $(CFLAGS) -Wall -g -std=c++11
|
132
|
+
CXXFLAGS = $(CFLAGS) -Wall -g -std=c++11 -Ofast -g0
|
133
133
|
ldflags = -L. -L/home/chang/.rbenv/versions/2.3.1/lib -fstack-protector -rdynamic -Wl,-export-dynamic -L/home/chang/.rbenv/versions/2.3.1/lib/ruby/gems/2.3.0/gems/rice-2.1.0/ruby/lib/lib -lrice
|
134
134
|
dldflags =
|
135
135
|
ARCH_FLAG =
|
@@ -187,13 +187,13 @@ DISTCLEANDIRS =
|
|
187
187
|
|
188
188
|
extout =
|
189
189
|
extout_prefix =
|
190
|
-
target_prefix =
|
190
|
+
target_prefix = /fast_bayes
|
191
191
|
LOCAL_LIBS =
|
192
192
|
LIBS = -Wl,-R/home/chang/.rbenv/versions/2.3.1/lib -L/home/chang/.rbenv/versions/2.3.1/lib -lruby -lpthread -lgmp -ldl -lcrypt -lm -lc
|
193
193
|
ORIG_SRCS = main.cpp wrapper.cpp
|
194
194
|
SRCS = $(ORIG_SRCS)
|
195
195
|
OBJS = main.o wrapper.o
|
196
|
-
HDRS = $(srcdir)/bayes.h
|
196
|
+
HDRS = $(srcdir)/stopwords.h $(srcdir)/bayes.h
|
197
197
|
TARGET = fast_bayes
|
198
198
|
TARGET_NAME = fast_bayes
|
199
199
|
TARGET_ENTRY = Init_$(TARGET_NAME)
|
@@ -237,7 +237,7 @@ distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
|
237
237
|
realclean: distclean
|
238
238
|
install: install-so install-rb
|
239
239
|
|
240
|
-
install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.time
|
240
|
+
install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.-.fast_bayes.time
|
241
241
|
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
242
242
|
clean-static::
|
243
243
|
-$(Q)$(RM) $(STATIC_LIB)
|
@@ -247,7 +247,7 @@ pre-install-rb: Makefile
|
|
247
247
|
pre-install-rb-default: Makefile
|
248
248
|
pre-install-rb-default:
|
249
249
|
@$(NULLCMD)
|
250
|
-
$(TIMESTAMP_DIR)/.RUBYARCHDIR.time:
|
250
|
+
$(TIMESTAMP_DIR)/.RUBYARCHDIR.-.fast_bayes.time:
|
251
251
|
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
252
252
|
$(Q) $(TOUCH) $@
|
253
253
|
|
@@ -306,7 +306,7 @@ site-install-rb: install-rb
|
|
306
306
|
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $<
|
307
307
|
|
308
308
|
$(DLLIB): $(OBJS) Makefile
|
309
|
-
$(ECHO) linking shared-object
|
309
|
+
$(ECHO) linking shared-object fast_bayes/$(DLLIB)
|
310
310
|
-$(Q)$(RM) $(@)
|
311
311
|
$(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
312
312
|
|
data/ext/fast_bayes/bayes.h
CHANGED
@@ -2,11 +2,12 @@
|
|
2
2
|
#define bayes
|
3
3
|
|
4
4
|
#include <unordered_map>
|
5
|
+
#include <unordered_set>
|
5
6
|
#include <map>
|
6
7
|
#include <vector>
|
7
8
|
#include <limits>
|
8
9
|
#include <cmath>
|
9
|
-
#include
|
10
|
+
#include "stopwords.h"
|
10
11
|
|
11
12
|
using namespace std;
|
12
13
|
typedef long long ll;
|
@@ -22,51 +23,61 @@ class Bayes {
|
|
22
23
|
ll count = 0;
|
23
24
|
// Total amounts of each term
|
24
25
|
unordered_map<string, ll> term_counts;
|
25
|
-
|
26
|
+
// Use English stopwords by default. TODO: add an option to use other stopword lists
|
27
|
+
const string default_stopwords = "ext/fast_bayes/stopwords/en";
|
28
|
+
unordered_set<string> stopwords;
|
26
29
|
|
27
|
-
double
|
30
|
+
double smoothing = 0.00000000000001;
|
31
|
+
|
32
|
+
double estimate_priori(const string &label) {
|
28
33
|
return (priori_counts[label] / (double) count);
|
29
34
|
}
|
30
35
|
|
31
36
|
// Occurrences of term in class divided by sum of all occurrences of term
|
32
|
-
double estimate_term(string term, string label) {
|
37
|
+
double estimate_term(const string &term, const string &label) {
|
33
38
|
return (word_counts[label][term] + smoothing) / ((smoothing * term_counts.size()) + term_counts[term]);
|
34
39
|
}
|
35
40
|
|
36
|
-
vector<string> tokenize_string(string data) {
|
37
|
-
vector<string> v;
|
38
|
-
string buf;
|
39
|
-
stringstream ss(data);
|
40
|
-
while(ss >> buf) v.push_back(buf);
|
41
|
-
return v;
|
42
|
-
}
|
43
|
-
|
44
41
|
public:
|
45
|
-
Bayes() {
|
42
|
+
Bayes() {
|
43
|
+
stopwords = read_stopwords(default_stopwords);
|
44
|
+
}
|
46
45
|
|
47
46
|
// Add a new observation
|
48
|
-
void observe(string data, string label) {
|
47
|
+
void observe(const string &data, const string &label) {
|
49
48
|
priori_counts[label]++;
|
50
49
|
count++;
|
51
|
-
for(string term : tokenize_string(data)) {
|
52
|
-
word_counts[label][term]++;
|
53
|
-
term_counts[term]++;
|
54
|
-
}
|
55
50
|
|
51
|
+
int prev = 0;
|
52
|
+
// Split on space characters
|
53
|
+
for(unsigned int i = 0; i < data.length(); i++) {
|
54
|
+
if (data[i] == ' ') {
|
55
|
+
const string term = data.substr(prev, i-prev);
|
56
|
+
|
57
|
+
if (term.length() < 2 || stopwords.find(term) != stopwords.end()) continue;
|
58
|
+
word_counts[label][term]++;
|
59
|
+
term_counts[term]++;
|
60
|
+
prev = i+1;
|
61
|
+
}
|
62
|
+
}
|
56
63
|
}
|
57
64
|
|
58
|
-
string classify(string data) {
|
65
|
+
string classify(const string &data) {
|
59
66
|
string best_class;
|
60
67
|
double best_score = -numeric_limits<double>::infinity();
|
61
68
|
|
62
|
-
for(auto label_pair : priori_counts) {
|
63
|
-
string label = label_pair.first;
|
69
|
+
for(const auto &label_pair : priori_counts) {
|
70
|
+
const string label = label_pair.first;
|
64
71
|
|
65
72
|
// Score for a single label given the data
|
66
73
|
double score = log(estimate_priori(label));
|
67
|
-
|
68
|
-
|
69
|
-
|
74
|
+
int prev = 0;
|
75
|
+
// Split on space characters
|
76
|
+
for(unsigned int i = 0; i < data.length(); i++) {
|
77
|
+
if (data[i] == ' ') {
|
78
|
+
score += log(estimate_term(data.substr(prev, i-prev), label));
|
79
|
+
prev = i+1;
|
80
|
+
}
|
70
81
|
}
|
71
82
|
|
72
83
|
if (score >= best_score) {
|
data/ext/fast_bayes/extconf.rb
CHANGED
@@ -0,0 +1,23 @@
|
|
1
|
+
#ifndef STOPWORDS
|
2
|
+
#define STOPWORDS
|
3
|
+
#include <unordered_set>
|
4
|
+
#include <fstream>
|
5
|
+
#include <iostream>
|
6
|
+
using namespace std;
|
7
|
+
|
8
|
+
inline unordered_set<string> read_stopwords(const string filename) {
|
9
|
+
ios_base::sync_with_stdio(0);
|
10
|
+
|
11
|
+
unordered_set<string> stopwords;
|
12
|
+
ifstream stream(filename);
|
13
|
+
string word;
|
14
|
+
|
15
|
+
while(getline(stream, word)) {
|
16
|
+
stopwords.insert(word);
|
17
|
+
}
|
18
|
+
|
19
|
+
stream.close();
|
20
|
+
return stopwords;
|
21
|
+
}
|
22
|
+
|
23
|
+
#endif
|
@@ -0,0 +1,174 @@
|
|
1
|
+
a
|
2
|
+
about
|
3
|
+
above
|
4
|
+
after
|
5
|
+
again
|
6
|
+
against
|
7
|
+
all
|
8
|
+
am
|
9
|
+
an
|
10
|
+
and
|
11
|
+
any
|
12
|
+
are
|
13
|
+
aren't
|
14
|
+
as
|
15
|
+
at
|
16
|
+
be
|
17
|
+
because
|
18
|
+
been
|
19
|
+
before
|
20
|
+
being
|
21
|
+
below
|
22
|
+
between
|
23
|
+
both
|
24
|
+
but
|
25
|
+
by
|
26
|
+
can't
|
27
|
+
cannot
|
28
|
+
could
|
29
|
+
couldn't
|
30
|
+
did
|
31
|
+
didn't
|
32
|
+
do
|
33
|
+
does
|
34
|
+
doesn't
|
35
|
+
doing
|
36
|
+
don't
|
37
|
+
down
|
38
|
+
during
|
39
|
+
each
|
40
|
+
few
|
41
|
+
for
|
42
|
+
from
|
43
|
+
further
|
44
|
+
had
|
45
|
+
hadn't
|
46
|
+
has
|
47
|
+
hasn't
|
48
|
+
have
|
49
|
+
haven't
|
50
|
+
having
|
51
|
+
he
|
52
|
+
he'd
|
53
|
+
he'll
|
54
|
+
he's
|
55
|
+
her
|
56
|
+
here
|
57
|
+
here's
|
58
|
+
hers
|
59
|
+
herself
|
60
|
+
him
|
61
|
+
himself
|
62
|
+
his
|
63
|
+
how
|
64
|
+
how's
|
65
|
+
i
|
66
|
+
i'd
|
67
|
+
i'll
|
68
|
+
i'm
|
69
|
+
i've
|
70
|
+
if
|
71
|
+
in
|
72
|
+
into
|
73
|
+
is
|
74
|
+
isn't
|
75
|
+
it
|
76
|
+
it's
|
77
|
+
its
|
78
|
+
itself
|
79
|
+
let's
|
80
|
+
me
|
81
|
+
more
|
82
|
+
most
|
83
|
+
mustn't
|
84
|
+
my
|
85
|
+
myself
|
86
|
+
no
|
87
|
+
nor
|
88
|
+
not
|
89
|
+
of
|
90
|
+
off
|
91
|
+
on
|
92
|
+
once
|
93
|
+
only
|
94
|
+
or
|
95
|
+
other
|
96
|
+
ought
|
97
|
+
our
|
98
|
+
ours
|
99
|
+
ourselves
|
100
|
+
out
|
101
|
+
over
|
102
|
+
own
|
103
|
+
same
|
104
|
+
shan't
|
105
|
+
she
|
106
|
+
she'd
|
107
|
+
she'll
|
108
|
+
she's
|
109
|
+
should
|
110
|
+
shouldn't
|
111
|
+
so
|
112
|
+
some
|
113
|
+
such
|
114
|
+
than
|
115
|
+
that
|
116
|
+
that's
|
117
|
+
the
|
118
|
+
their
|
119
|
+
theirs
|
120
|
+
them
|
121
|
+
themselves
|
122
|
+
then
|
123
|
+
there
|
124
|
+
there's
|
125
|
+
these
|
126
|
+
they
|
127
|
+
they'd
|
128
|
+
they'll
|
129
|
+
they're
|
130
|
+
they've
|
131
|
+
this
|
132
|
+
those
|
133
|
+
through
|
134
|
+
to
|
135
|
+
too
|
136
|
+
under
|
137
|
+
until
|
138
|
+
up
|
139
|
+
very
|
140
|
+
was
|
141
|
+
wasn't
|
142
|
+
we
|
143
|
+
we'd
|
144
|
+
we'll
|
145
|
+
we're
|
146
|
+
we've
|
147
|
+
were
|
148
|
+
weren't
|
149
|
+
what
|
150
|
+
what's
|
151
|
+
when
|
152
|
+
when's
|
153
|
+
where
|
154
|
+
where's
|
155
|
+
which
|
156
|
+
while
|
157
|
+
who
|
158
|
+
who's
|
159
|
+
whom
|
160
|
+
why
|
161
|
+
why's
|
162
|
+
with
|
163
|
+
won't
|
164
|
+
would
|
165
|
+
wouldn't
|
166
|
+
you
|
167
|
+
you'd
|
168
|
+
you'll
|
169
|
+
you're
|
170
|
+
you've
|
171
|
+
your
|
172
|
+
yours
|
173
|
+
yourself
|
174
|
+
yourselves
|