fast_bayes 1.0.1 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8e4ec075bc148f521ccb5520c1981228f2f8a2ad
4
- data.tar.gz: 3c0bbde8575af7b72eab0c319cda8152cd26b2d6
3
+ metadata.gz: dc4b2951cd54f309119ba86832defb867967acd4
4
+ data.tar.gz: 9ac9c0aa6d4925229a3f2255171358c8d2146c37
5
5
  SHA512:
6
- metadata.gz: 45628021d7d96127764e299112f1aa9402d5318882dc28a1fbe098151fd71a5b174814efd1ec5909e0105010d07a1ae10e905d6546f27dd48087491c387daffb
7
- data.tar.gz: 7227ac4d1184f3e265554283ae20d05a113dff89d824532212a9484bd871069914b2fcad64c0ad1126143f5b96ec78c9671381b93629009fe7b50d45014fe7ac
6
+ metadata.gz: bc0e2b924f9c6d866b248e271923d73dc1232ba386505bfa8ee9af589d36c6da37afce753d9c6b464d7ade32edfab3a832aa7349d89a429720311517ab745a23
7
+ data.tar.gz: ac85cb50805f784e8b78c5dcfcfa2fd91af017ef7569bf12b094971e67b2d9d780e8e4e173ddd666afd6a70d7cc87db0419ef4e3c3087ceffc43061e3600f9f0
data/.travis.yml CHANGED
@@ -1,5 +1,13 @@
1
- sudo: false
1
+ sudo: required
2
+ dist: trusty
3
+
4
+ compiler:
5
+ - clang
6
+ - gcc
7
+
2
8
  language: ruby
3
9
  rvm:
4
10
  - 2.3.1
5
- before_install: gem install bundler -v 1.13.0
11
+ before_install:
12
+ - bin/setup
13
+ - bundle exec rspec
data/README.md CHANGED
@@ -1,9 +1,11 @@
1
- # FastBayes
1
+ # FastBayes [![Build Status](https://travis-ci.org/Coolnesss/fast-bayes.svg?branch=master)](https://travis-ci.org/Coolnesss/fast-bayes)
2
2
 
3
3
  A fast implementation of the naive Bayes classification algorithm. Written in C++ with an interface for Ruby using [Rice](https://github.com/jasonroelofs/rice).
4
4
 
5
5
  Performs text classification with no separate training step needed; the cost of training is split between classification and observation. This is especially useful when the data arrives as an online stream, as the system can gradually improve.
6
6
 
7
+ FastBayes supports any number of classes and they don't need to be added in advance.
8
+
7
9
  ## Installation
8
10
 
9
11
  Add this line to your application's Gemfile:
@@ -22,7 +24,18 @@ Or install it yourself as:
22
24
 
23
25
  ## Usage
24
26
 
25
- TODO: Write usage instructions here
27
+ ```ruby
28
+ [1] pry(main)> require 'fast_bayes'
29
+ => true
30
+ [2] pry(main)> b = FastBayes.new
31
+ => #<FastBayes:0x00000002cb5d98>
32
+ [3] pry(main)> b.observe "This sentence is good", "Good"
33
+ => nil
34
+ [4] pry(main)> b.observe "This sentence is bad", "Bad"
35
+ => nil
36
+ [5] pry(main)> b.classify "good stuff"
37
+ => "Good"
38
+ ```
26
39
 
27
40
  ## Development
28
41
 
@@ -32,7 +45,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
32
45
 
33
46
  ## Contributing
34
47
 
35
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/fast_bayes.
48
+ Bug reports and pull requests are welcome on GitHub at https://github.com/Coolnesss/fast-bayes.
36
49
 
37
50
 
38
51
  ## License
@@ -0,0 +1,46 @@
1
+ require 'benchmark'
2
+ require '/home/chang/fast_bayes/lib/fast_bayes/fast_bayes'
3
+ require 'classifier-reborn' # Install from rubygems
4
+
5
+
6
+ # Read newspaper data
7
+ # Data from http://ana.cachopo.org/datasets-for-single-label-text-categorization
8
+ d = IO.read("ext/test/data/r8-train-all-terms.txt")
9
+ training = d.split("\n").map{|x| x.split("\t")}
10
+
11
+ d = IO.read("ext/test/data/r8-test-all-terms.txt")
12
+ test = d.split("\n").map{|x| x.split("\t")}
13
+ test = test.shuffle
14
+
15
+ n = test.size
16
+ # FastBayes
17
+ puts Benchmark.measure {
18
+ b = FastBayes.new
19
+ # Train on all the training samples
20
+ training.each do |t|
21
+ b.observe(t[1], t.first)
22
+ end
23
+
24
+ errors = 0
25
+
26
+ # Classify the shuffled test data
27
+ test.take(n).each do |t|
28
+ errors = errors + 1 if b.classify(t[1]) != t.first
29
+ end
30
+ puts "Error rate #{(errors / (n*1.0))}"
31
+ }
32
+
33
+ # ClassifierReborn
34
+ puts Benchmark.measure {
35
+ classifier = ClassifierReborn::Bayes.new auto_categorize: true
36
+ training.each do |t|
37
+ classifier.train t.first, t[1]
38
+ end
39
+
40
+ errors = 0
41
+
42
+ test.take(n).each do |t|
43
+ errors = errors + 1 if classifier.classify(t[1]).downcase != t.first
44
+ end
45
+ puts "Error rate #{(errors / (n*1.0))}"
46
+ }
data/bin/setup CHANGED
@@ -4,5 +4,4 @@ IFS=$'\n\t'
4
4
  set -vx
5
5
 
6
6
  bundle install
7
-
8
- # Do any other automated setup that you need to do here
7
+ rake compile
data/ext/Makefile CHANGED
@@ -1,5 +1,5 @@
1
1
  debug:
2
- clang++ -std=c++11 -g ./test/*.cpp
2
+ clang++ -std=c++11 -O3 ./test/*.cpp
3
3
  test:
4
4
  clang++ -std=c++11 -O3 ./test/*.cpp && ./a.out
5
5
  main:
@@ -129,7 +129,7 @@ CFLAGS = $(CCDLFLAGS) $(cflags) -fPIC $(ARCH_FLAG)
129
129
  INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
130
130
  DEFS =
131
131
  CPPFLAGS = -I/home/chang/.rbenv/versions/2.3.1/include $(DEFS) $(cppflags) -I/home/chang/.rbenv/versions/2.3.1/lib/ruby/gems/2.3.0/gems/rice-2.1.0/ruby/lib/include
132
- CXXFLAGS = $(CFLAGS) -Wall -g -std=c++11
132
+ CXXFLAGS = $(CFLAGS) -Wall -g -std=c++11 -Ofast -g0
133
133
  ldflags = -L. -L/home/chang/.rbenv/versions/2.3.1/lib -fstack-protector -rdynamic -Wl,-export-dynamic -L/home/chang/.rbenv/versions/2.3.1/lib/ruby/gems/2.3.0/gems/rice-2.1.0/ruby/lib/lib -lrice
134
134
  dldflags =
135
135
  ARCH_FLAG =
@@ -187,13 +187,13 @@ DISTCLEANDIRS =
187
187
 
188
188
  extout =
189
189
  extout_prefix =
190
- target_prefix =
190
+ target_prefix = /fast_bayes
191
191
  LOCAL_LIBS =
192
192
  LIBS = -Wl,-R/home/chang/.rbenv/versions/2.3.1/lib -L/home/chang/.rbenv/versions/2.3.1/lib -lruby -lpthread -lgmp -ldl -lcrypt -lm -lc
193
193
  ORIG_SRCS = main.cpp wrapper.cpp
194
194
  SRCS = $(ORIG_SRCS)
195
195
  OBJS = main.o wrapper.o
196
- HDRS = $(srcdir)/bayes.h
196
+ HDRS = $(srcdir)/stopwords.h $(srcdir)/bayes.h
197
197
  TARGET = fast_bayes
198
198
  TARGET_NAME = fast_bayes
199
199
  TARGET_ENTRY = Init_$(TARGET_NAME)
@@ -237,7 +237,7 @@ distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
237
237
  realclean: distclean
238
238
  install: install-so install-rb
239
239
 
240
- install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.time
240
+ install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.-.fast_bayes.time
241
241
  $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
242
242
  clean-static::
243
243
  -$(Q)$(RM) $(STATIC_LIB)
@@ -247,7 +247,7 @@ pre-install-rb: Makefile
247
247
  pre-install-rb-default: Makefile
248
248
  pre-install-rb-default:
249
249
  @$(NULLCMD)
250
- $(TIMESTAMP_DIR)/.RUBYARCHDIR.time:
250
+ $(TIMESTAMP_DIR)/.RUBYARCHDIR.-.fast_bayes.time:
251
251
  $(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
252
252
  $(Q) $(TOUCH) $@
253
253
 
@@ -306,7 +306,7 @@ site-install-rb: install-rb
306
306
  $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $<
307
307
 
308
308
  $(DLLIB): $(OBJS) Makefile
309
- $(ECHO) linking shared-object $(DLLIB)
309
+ $(ECHO) linking shared-object fast_bayes/$(DLLIB)
310
310
  -$(Q)$(RM) $(@)
311
311
  $(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
312
312
 
@@ -2,11 +2,12 @@
2
2
  #define bayes
3
3
 
4
4
  #include <unordered_map>
5
+ #include <unordered_set>
5
6
  #include <map>
6
7
  #include <vector>
7
8
  #include <limits>
8
9
  #include <cmath>
9
- #include <sstream>
10
+ #include "stopwords.h"
10
11
 
11
12
  using namespace std;
12
13
  typedef long long ll;
@@ -22,51 +23,61 @@ class Bayes {
22
23
  ll count = 0;
23
24
  // Total amounts of each term
24
25
  unordered_map<string, ll> term_counts;
25
- double smoothing = 0.5;
26
+ // Use English stopwords by default. TODO: add an option to use a different list
27
+ const string default_stopwords = "ext/fast_bayes/stopwords/en";
28
+ unordered_set<string> stopwords;
26
29
 
27
- double estimate_priori(string label) {
30
+ double smoothing = 0.00000000000001;
31
+
32
+ double estimate_priori(const string &label) {
28
33
  return (priori_counts[label] / (double) count);
29
34
  }
30
35
 
31
36
  // Occurrences of term in class divided by sum of all occurrences of term
32
- double estimate_term(string term, string label) {
37
+ double estimate_term(const string &term, const string &label) {
33
38
  return (word_counts[label][term] + smoothing) / ((smoothing * term_counts.size()) + term_counts[term]);
34
39
  }
35
40
 
36
- vector<string> tokenize_string(string data) {
37
- vector<string> v;
38
- string buf;
39
- stringstream ss(data);
40
- while(ss >> buf) v.push_back(buf);
41
- return v;
42
- }
43
-
44
41
  public:
45
- Bayes() {}
42
+ Bayes() {
43
+ stopwords = read_stopwords(default_stopwords);
44
+ }
46
45
 
47
46
  // Add a new observation
48
- void observe(string data, string label) {
47
+ void observe(const string &data, const string &label) {
49
48
  priori_counts[label]++;
50
49
  count++;
51
- for(string term : tokenize_string(data)) {
52
- word_counts[label][term]++;
53
- term_counts[term]++;
54
- }
55
50
 
51
+ int prev = 0;
52
+ // Split on ' ' only (not all whitespace); the final term, which has no trailing space, is not counted
53
+ for(unsigned int i = 0; i < data.length(); i++) {
54
+ if (data[i] == ' ') {
55
+ const string term = data.substr(prev, i-prev);
56
+
57
+ if (term.length() < 2 || stopwords.find(term) != stopwords.end()) continue;
58
+ word_counts[label][term]++;
59
+ term_counts[term]++;
60
+ prev = i+1;
61
+ }
62
+ }
56
63
  }
57
64
 
58
- string classify(string data) {
65
+ string classify(const string &data) {
59
66
  string best_class;
60
67
  double best_score = -numeric_limits<double>::infinity();
61
68
 
62
- for(auto label_pair : priori_counts) {
63
- string label = label_pair.first;
69
+ for(const auto &label_pair : priori_counts) {
70
+ const string label = label_pair.first;
64
71
 
65
72
  // Score for a single label given the data
66
73
  double score = log(estimate_priori(label));
67
- vector<string> terms = tokenize_string(data);
68
- for(string term : terms) {
69
- score += log(estimate_term(term, label));
74
+ int prev = 0;
75
+ // Split on ' ' only (not all whitespace); the final term, which has no trailing space, is not scored
76
+ for(unsigned int i = 0; i < data.length(); i++) {
77
+ if (data[i] == ' ') {
78
+ score += log(estimate_term(data.substr(prev, i-prev), label));
79
+ prev = i+1;
80
+ }
70
81
  }
71
82
 
72
83
  if (score >= best_score) {
@@ -1,4 +1,6 @@
1
1
  require 'mkmf-rice'
2
2
 
3
3
  $CXXFLAGS += " -std=c++11 "
4
+ $CXXFLAGS += " -Ofast "
5
+ $CXXFLAGS += " -g0 "
4
6
  create_makefile('fast_bayes/fast_bayes')
@@ -0,0 +1,23 @@
1
+ #ifndef STOPWORDS
2
+ #define STOPWORDS
3
+ #include <unordered_set>
4
+ #include <fstream>
5
+ #include <iostream>
6
+ using namespace std;
7
+
8
+ inline unordered_set<string> read_stopwords(const string filename) {
9
+ ios_base::sync_with_stdio(0);
10
+
11
+ unordered_set<string> stopwords;
12
+ ifstream stream(filename);
13
+ string word;
14
+
15
+ while(getline(stream, word)) {
16
+ stopwords.insert(word);
17
+ }
18
+
19
+ stream.close();
20
+ return stopwords;
21
+ }
22
+
23
+ #endif
@@ -0,0 +1,174 @@
1
+ a
2
+ about
3
+ above
4
+ after
5
+ again
6
+ against
7
+ all
8
+ am
9
+ an
10
+ and
11
+ any
12
+ are
13
+ aren't
14
+ as
15
+ at
16
+ be
17
+ because
18
+ been
19
+ before
20
+ being
21
+ below
22
+ between
23
+ both
24
+ but
25
+ by
26
+ can't
27
+ cannot
28
+ could
29
+ couldn't
30
+ did
31
+ didn't
32
+ do
33
+ does
34
+ doesn't
35
+ doing
36
+ don't
37
+ down
38
+ during
39
+ each
40
+ few
41
+ for
42
+ from
43
+ further
44
+ had
45
+ hadn't
46
+ has
47
+ hasn't
48
+ have
49
+ haven't
50
+ having
51
+ he
52
+ he'd
53
+ he'll
54
+ he's
55
+ her
56
+ here
57
+ here's
58
+ hers
59
+ herself
60
+ him
61
+ himself
62
+ his
63
+ how
64
+ how's
65
+ i
66
+ i'd
67
+ i'll
68
+ i'm
69
+ i've
70
+ if
71
+ in
72
+ into
73
+ is
74
+ isn't
75
+ it
76
+ it's
77
+ its
78
+ itself
79
+ let's
80
+ me
81
+ more
82
+ most
83
+ mustn't
84
+ my
85
+ myself
86
+ no
87
+ nor
88
+ not
89
+ of
90
+ off
91
+ on
92
+ once
93
+ only
94
+ or
95
+ other
96
+ ought
97
+ our
98
+ ours
99
+ ourselves
100
+ out
101
+ over
102
+ own
103
+ same
104
+ shan't
105
+ she
106
+ she'd
107
+ she'll
108
+ she's
109
+ should
110
+ shouldn't
111
+ so
112
+ some
113
+ such
114
+ than
115
+ that
116
+ that's
117
+ the
118
+ their
119
+ theirs
120
+ them
121
+ themselves
122
+ then
123
+ there
124
+ there's
125
+ these
126
+ they
127
+ they'd
128
+ they'll
129
+ they're
130
+ they've
131
+ this
132
+ those
133
+ through
134
+ to
135
+ too
136
+ under
137
+ until
138
+ up
139
+ very
140
+ was
141
+ wasn't
142
+ we
143
+ we'd
144
+ we'll
145
+ we're
146
+ we've
147
+ were
148
+ weren't
149
+ what
150
+ what's
151
+ when
152
+ when's
153
+ where
154
+ where's
155
+ which
156
+ while
157
+ who
158
+ who's
159
+ whom
160
+ why
161
+ why's
162
+ with
163
+ won't
164
+ would
165
+ wouldn't
166
+ you
167
+ you'd
168
+ you'll
169
+ you're
170
+ you've
171
+ your
172
+ yours
173
+ yourself
174
+ yourselves