RubyGems - fast_bayes - Versions diffs - 1.0.1 → 1.1.0 - Mend

fast_bayes 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

checksums.yaml +4 -4
data/.travis.yml +10 -2
data/README.md +16 -3
data/benchmarks/bench.rb +46 -0
data/bin/setup +1 -2
data/ext/Makefile +1 -1
data/ext/fast_bayes/Makefile +6 -6
data/ext/fast_bayes/bayes.h +35 -24
data/ext/fast_bayes/extconf.rb +2 -0
data/ext/fast_bayes/stopwords.h +23 -0
data/ext/fast_bayes/stopwords/en +174 -0
data/ext/test/classification_test.cpp +5 -5
data/ext/test/data/r8-test-all-terms.txt +2189 -0
data/ext/test/data/r8-train-all-terms.txt +5485 -0
data/fast_bayes.gemspec +1 -0
data/lib/fast_bayes.rb +20 -2
data/lib/fast_bayes/version.rb +1 -1
metadata +27 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8e4ec075bc148f521ccb5520c1981228f2f8a2ad
-  data.tar.gz: 3c0bbde8575af7b72eab0c319cda8152cd26b2d6
+  metadata.gz: dc4b2951cd54f309119ba86832defb867967acd4
+  data.tar.gz: 9ac9c0aa6d4925229a3f2255171358c8d2146c37
 SHA512:
-  metadata.gz: 45628021d7d96127764e299112f1aa9402d5318882dc28a1fbe098151fd71a5b174814efd1ec5909e0105010d07a1ae10e905d6546f27dd48087491c387daffb
-  data.tar.gz: 7227ac4d1184f3e265554283ae20d05a113dff89d824532212a9484bd871069914b2fcad64c0ad1126143f5b96ec78c9671381b93629009fe7b50d45014fe7ac
+  metadata.gz: bc0e2b924f9c6d866b248e271923d73dc1232ba386505bfa8ee9af589d36c6da37afce753d9c6b464d7ade32edfab3a832aa7349d89a429720311517ab745a23
+  data.tar.gz: ac85cb50805f784e8b78c5dcfcfa2fd91af017ef7569bf12b094971e67b2d9d780e8e4e173ddd666afd6a70d7cc87db0419ef4e3c3087ceffc43061e3600f9f0

data/.travis.yml CHANGED Viewed

@@ -1,5 +1,13 @@
-sudo: false
+sudo: required
+dist: trusty
+compiler:
+    - clang
+    - gcc
 language: ruby
 rvm:
   - 2.3.1
-before_install: gem install bundler -v 1.13.0
+before_install:
+  - bin/setup
+  - bundle exec rspec

data/README.md CHANGED Viewed

@@ -1,9 +1,11 @@
-# FastBayes
+# FastBayes [![Build Status](https://travis-ci.org/Coolnesss/fast-bayes.svg?branch=master)](https://travis-ci.org/Coolnesss/fast-bayes)
 A fast implementation of the naive Bayes classification algorithm. Written in C++ with an interface for Ruby using [Rice](https://github.com/jasonroelofs/rice).
 Performs text classification with no separate training step: the cost of training is split between observation and classification. This is especially useful when the data arrives as an online stream, because the classifier improves gradually as it observes more samples.
+FastBayes supports any number of classes and they don't need to be added in advance.
 ## Installation
 Add this line to your application's Gemfile:
@@ -22,7 +24,18 @@ Or install it yourself as:
 ## Usage
-TODO: Write usage instructions here
+```ruby
+[1] pry(main)> require 'fast_bayes'
+=> true
+[2] pry(main)> b = FastBayes.new
+=> #<FastBayes:0x00000002cb5d98>
+[3] pry(main)> b.observe "This sentence is good", "Good"
+=> nil
+[4] pry(main)> b.observe "This sentence is bad", "Bad"
+=> nil
+[5] pry(main)> b.classify "good stuff"
+=> "Good"
+```
 ## Development
@@ -32,7 +45,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 ## Contributing
-Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/fast_bayes.
+Bug reports and pull requests are welcome on GitHub at https://github.com/Coolnesss/fast-bayes.
 ## License

data/benchmarks/bench.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require 'benchmark'
+require '/home/chang/fast_bayes/lib/fast_bayes/fast_bayes'
+require 'classifier-reborn' # Install from rubygems
+# Read newspaper data
+# Data from http://ana.cachopo.org/datasets-for-single-label-text-categorization
+d = IO.read("ext/test/data/r8-train-all-terms.txt")
+training = d.split("\n").map{|x| x.split("\t")}
+d = IO.read("ext/test/data/r8-test-all-terms.txt")
+test = d.split("\n").map{|x| x.split("\t")}
+test = test.shuffle
+n = test.size
+# FastBayes
+puts Benchmark.measure {
+  b = FastBayes.new
+  # Train all the training samples
+  training.each do |t|
+      b.observe(t[1], t.first)
+  end
+  errors = 0
+  # Classify random test data
+  test.take(n).each do |t|
+      errors = errors + 1 if b.classify(t[1]) != t.first
+  end
+  puts "Error rate #{(errors / (n*1.0))}"
+}
+# ClassifierReborn
+puts Benchmark.measure {
+  classifier =  ClassifierReborn::Bayes.new auto_categorize: true
+  training.each do |t|
+    classifier.train t.first, t[1]
+  end
+  errors = 0
+  test.take(n).each do |t|
+      errors = errors + 1 if classifier.classify(t[1]).downcase != t.first
+  end
+  puts "Error rate #{(errors / (n*1.0))}"
+}

data/bin/setup CHANGED Viewed

@@ -4,5 +4,4 @@ IFS=$'\n\t'
 set -vx
 bundle install
-# Do any other automated setup that you need to do here
+rake compile

data/ext/Makefile CHANGED Viewed

@@ -1,5 +1,5 @@
 debug:
-	clang++ -std=c++11 -g ./test/*.cpp
+	clang++ -std=c++11 -O3 ./test/*.cpp
 test:
 	clang++ -std=c++11 -O3 ./test/*.cpp && ./a.out
 main:

data/ext/fast_bayes/Makefile CHANGED Viewed

@@ -129,7 +129,7 @@ CFLAGS   = $(CCDLFLAGS) $(cflags)  -fPIC $(ARCH_FLAG)
 INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
 DEFS     =
 CPPFLAGS =  -I/home/chang/.rbenv/versions/2.3.1/include  $(DEFS) $(cppflags)  -I/home/chang/.rbenv/versions/2.3.1/lib/ruby/gems/2.3.0/gems/rice-2.1.0/ruby/lib/include
-CXXFLAGS = $(CFLAGS)  -Wall -g -std=c++11
+CXXFLAGS = $(CFLAGS)  -Wall -g -std=c++11  -Ofast  -g0
 ldflags  = -L. -L/home/chang/.rbenv/versions/2.3.1/lib  -fstack-protector -rdynamic -Wl,-export-dynamic  -L/home/chang/.rbenv/versions/2.3.1/lib/ruby/gems/2.3.0/gems/rice-2.1.0/ruby/lib/lib -lrice
 dldflags =
 ARCH_FLAG =
@@ -187,13 +187,13 @@ DISTCLEANDIRS =
 extout =
 extout_prefix =
-target_prefix =
+target_prefix = /fast_bayes
 LOCAL_LIBS =
 LIBS = -Wl,-R/home/chang/.rbenv/versions/2.3.1/lib -L/home/chang/.rbenv/versions/2.3.1/lib -lruby -lpthread -lgmp -ldl -lcrypt -lm   -lc
 ORIG_SRCS = main.cpp wrapper.cpp
 SRCS = $(ORIG_SRCS)
 OBJS = main.o wrapper.o
-HDRS = $(srcdir)/bayes.h
+HDRS = $(srcdir)/stopwords.h $(srcdir)/bayes.h
 TARGET = fast_bayes
 TARGET_NAME = fast_bayes
 TARGET_ENTRY = Init_$(TARGET_NAME)
@@ -237,7 +237,7 @@ distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
 realclean: distclean
 install: install-so install-rb
-install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.time
+install-so: $(DLLIB) $(TIMESTAMP_DIR)/.RUBYARCHDIR.-.fast_bayes.time
 	$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
 clean-static::
 	-$(Q)$(RM) $(STATIC_LIB)
@@ -247,7 +247,7 @@ pre-install-rb: Makefile
 pre-install-rb-default: Makefile
 pre-install-rb-default:
 	@$(NULLCMD)
-$(TIMESTAMP_DIR)/.RUBYARCHDIR.time:
+$(TIMESTAMP_DIR)/.RUBYARCHDIR.-.fast_bayes.time:
 	$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
 	$(Q) $(TOUCH) $@
@@ -306,7 +306,7 @@ site-install-rb: install-rb
 	$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $<
 $(DLLIB): $(OBJS) Makefile
-	$(ECHO) linking shared-object $(DLLIB)
+	$(ECHO) linking shared-object fast_bayes/$(DLLIB)
 	-$(Q)$(RM) $(@)
 	$(Q) $(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)

data/ext/fast_bayes/bayes.h CHANGED Viewed

@@ -2,11 +2,12 @@
 #define bayes
 #include <unordered_map>
+#include <unordered_set>
 #include <map>
 #include <vector>
 #include <limits>
 #include <cmath>
-#include <sstream>
+#include "stopwords.h"
 using namespace std;
 typedef long long ll;
@@ -22,51 +23,61 @@ class Bayes {
         ll count = 0;
         // Total amounts of each term
         unordered_map<string, ll> term_counts;
-        double smoothing = 0.5;
+        // Use the English stopword list by default. TODO: add an option to load a different list.
+        const string default_stopwords = "ext/fast_bayes/stopwords/en";
+        unordered_set<string> stopwords;
-        double estimate_priori(string label) {
+        double smoothing = 0.00000000000001;
+        double estimate_priori(const string &label) {
             return (priori_counts[label] / (double) count);
         }
         // Smoothed ratio: occurrences of term in the class divided by all occurrences of the term
-        double estimate_term(string term, string label) {
+        double estimate_term(const string &term, const string &label) {
             return (word_counts[label][term] + smoothing) / ((smoothing * term_counts.size()) + term_counts[term]);
         }
-        vector<string> tokenize_string(string data) {
-            vector<string> v;
-            string buf;
-            stringstream ss(data);
-            while(ss >> buf) v.push_back(buf);
-            return v;
-        }
     public:
-        Bayes() {}
+        Bayes() {
+            stopwords = read_stopwords(default_stopwords);
+        }
         // Add a new observation
-        void observe(string data, string label) {
+        void observe(const string &data, const string &label) {
             priori_counts[label]++;
             count++;
-            for(string term : tokenize_string(data)) {
-                word_counts[label][term]++;
-                term_counts[term]++;
-            }
+            int prev = 0;
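+            // Split on single space characters (' ') only.
+            // NOTE(review): the `continue` below skips `prev = i+1`, so a skipped token stays glued to the next one; text after the last space is never counted.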
+            for(unsigned int i = 0; i < data.length(); i++) {
+                if (data[i] == ' ') {
+                    const string term = data.substr(prev, i-prev);
+                    if (term.length() < 2 || stopwords.find(term) != stopwords.end()) continue;
+                    word_counts[label][term]++;
+                    term_counts[term]++;
+                    prev = i+1;
+                }
+            }
         }
-        string classify(string data) {
+        string classify(const string &data) {
             string best_class;
             double best_score = -numeric_limits<double>::infinity();
-            for(auto label_pair : priori_counts) {
-                string label = label_pair.first;
+            for(const auto &label_pair : priori_counts) {
+                const string label = label_pair.first;
                 // Score for a single label given the data
                 double score = log(estimate_priori(label));
-                vector<string> terms = tokenize_string(data);
-                for(string term : terms) {
-                    score += log(estimate_term(term, label));
+                int prev = 0;
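+                // Split on single space characters (' ') only.
+                // NOTE(review): text after the last space is never scored, and stopwords are not filtered here as they are in observe().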
+                for(unsigned int i = 0; i < data.length(); i++) {
+                    if (data[i] == ' ') {
+                        score += log(estimate_term(data.substr(prev, i-prev), label));
+                        prev = i+1;
+                    }
                 }
                 if (score >= best_score) {

data/ext/fast_bayes/extconf.rb CHANGED Viewed

@@ -1,4 +1,6 @@
 require 'mkmf-rice'
 $CXXFLAGS += " -std=c++11 "
+$CXXFLAGS += " -Ofast "
+$CXXFLAGS += " -g0 "
 create_makefile('fast_bayes/fast_bayes')

data/ext/fast_bayes/stopwords.h ADDED Viewed

@@ -0,0 +1,23 @@
+#ifndef STOPWORDS
+#define STOPWORDS
+#include <unordered_set>
+#include <fstream>
+#include <iostream>
+#include <string>
+using namespace std;
+// Reads a stopword list from `filename`, one word per line, and returns the
+// words as a set. Blank lines are skipped and a trailing '\r' (from CRLF line
+// endings) is stripped. If the file cannot be opened the result is an empty
+// set; no error is reported.
+inline unordered_set<string> read_stopwords(const string &filename) {
+    // Kept from the original. NOTE(review): this setting concerns the standard
+    // streams (cin/cout/...), not the ifstream used below - confirm it is needed.
+    ios_base::sync_with_stdio(0);
+    unordered_set<string> stopwords;
+    ifstream stream(filename);
+    string word;
+    while(getline(stream, word)) {
+        if (!word.empty() && word.back() == '\r') word.pop_back();
+        if (!word.empty()) stopwords.insert(word);
+    }
+    // The ifstream closes the file itself when it goes out of scope.
+    return stopwords;
+}
+#endif

data/ext/fast_bayes/stopwords/en ADDED Viewed

@@ -0,0 +1,174 @@
+a
+about
+above
+after
+again
+against
+all
+am
+an
+and
+any
+are
+aren't
+as
+at
+be
+because
+been
+before
+being
+below
+between
+both
+but
+by
+can't
+cannot
+could
+couldn't
+did
+didn't
+do
+does
+doesn't
+doing
+don't
+down
+during
+each
+few
+for
+from
+further
+had
+hadn't
+has
+hasn't
+have
+haven't
+having
+he
+he'd
+he'll
+he's
+her
+here
+here's
+hers
+herself
+him
+himself
+his
+how
+how's
+i
+i'd
+i'll
+i'm
+i've
+if
+in
+into
+is
+isn't
+it
+it's
+its
+itself
+let's
+me
+more
+most
+mustn't
+my
+myself
+no
+nor
+not
+of
+off
+on
+once
+only
+or
+other
+ought
+our
+ours
+ourselves
+out
+over
+own
+same
+shan't
+she
+she'd
+she'll
+she's
+should
+shouldn't
+so
+some
+such
+than
+that
+that's
+the
+their
+theirs
+them
+themselves
+then
+there
+there's
+these
+they
+they'd
+they'll
+they're
+they've
+this
+those
+through
+to
+too
+under
+until
+up
+very
+was
+wasn't
+we
+we'd
+we'll
+we're
+we've
+were
+weren't
+what
+what's
+when
+when's
+where
+where's
+which
+while
+who
+who's
+whom
+why
+why's
+with
+won't
+would
+wouldn't
+you
+you'd
+you'll
+you're
+you've
+your
+yours
+yourself
+yourselves