lurn 0.1.1 → 0.1.2
- checksums.yaml +4 -4
- data/.circleci/config.yml +46 -0
- data/README.md +11 -32
- data/lib/lurn.rb +6 -2
- data/lib/lurn/naive_bayes/base.rb +32 -0
- data/lib/lurn/naive_bayes/bernoulli_naive_bayes.rb +17 -44
- data/lib/lurn/naive_bayes/multinomial_naive_bayes.rb +65 -0
- data/lib/lurn/neighbors/knn_base.rb +54 -0
- data/lib/lurn/neighbors/knn_classifier.rb +23 -0
- data/lib/lurn/neighbors/knn_regression.rb +20 -0
- data/lib/lurn/text/bernoulli_vectorizer.rb +3 -6
- data/lib/lurn/text/word_count_vectorizer.rb +65 -0
- data/lib/lurn/text/word_tokenizer.rb +7 -2
- data/lurn.gemspec +7 -9
- data/readmes/evaluation/classifier_evaluator.md +21 -0
- data/readmes/naive_bayes/bernoulli_naive_bayes.md +41 -0
- data/readmes/naive_bayes/multinomial_naive_bayes.md +41 -0
- data/readmes/neighbors/knn_classification.md +48 -0
- data/readmes/neighbors/knn_regression.md +48 -0
- data/readmes/text_processing/bernoulli_vectorizer.md +30 -0
- metadata +69 -11
- data/lib/lurn/version.rb +0 -3
checksums.yaml CHANGED

````diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 74be6a1bd3e76e61d34048367f8fafed76c39a46
+  data.tar.gz: 355ea667da4dd95845d00d8ebebc40636d740cab
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9dad1d2540818efd226bb029aca818b19f5995a3bfbe77392e2577236038f7c84237cb4b19203e0101da1830338d135ea321e27cbecd32664851994c5a100035
+  data.tar.gz: 4defd5fc70dcfbd3389ab184cf59764cf734bc8bda9347070e0c448cb7286ca7d1cd17ca926785191501ca030a4f8f3c74639fa4271a3f6d13cabe88d2a012d0
````
data/.circleci/config.yml ADDED

````diff
@@ -0,0 +1,46 @@
+# Ruby CircleCI 2.0 configuration file
+#
+# Check https://circleci.com/docs/2.0/language-ruby/ for more details
+#
+version: 2
+jobs:
+  build:
+    docker:
+      # specify the version you desire here
+      - image: circleci/ruby:2.4
+
+      # Specify service dependencies here if necessary
+      # CircleCI maintains a library of pre-built images
+      # documented at https://circleci.com/docs/2.0/circleci-images/
+      # - image: circleci/postgres:9.4
+
+    working_directory: ~/repo
+
+    steps:
+      - checkout
+
+      - run:
+          name: install dependencies
+          command: |
+            bundle install --jobs=4 --retry=3 --path vendor/bundle
+
+      # run tests!
+      - run:
+          name: run tests
+          command: |
+            mkdir /tmp/test-results
+            TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
+
+            bundle exec rspec --format progress \
+                              --format RspecJunitFormatter \
+                              --out /tmp/test-results/rspec.xml \
+                              --format progress \
+                              -- \
+                              $TEST_FILES
+
+      # collect reports
+      - store_test_results:
+          path: /tmp/test-results
+      - store_artifacts:
+          path: /tmp/test-results
+          destination: test-results
````
data/README.md CHANGED

````diff
@@ -1,6 +1,6 @@
 # Lurn
 
-Lurn is a ruby gem for performing machine learning. The API and design patterns in Lurn are inspired by
+Lurn is a ruby gem for performing machine learning tasks. The API and design patterns in Lurn are inspired by scikit-learn, a popular machine learning library for Python.
 
 ## Installation
 
@@ -20,37 +20,16 @@ Or install it yourself as:
 
 ## Usage
 
-```ruby
-documents = [
-  'ruby is a great programming language',
-  'the giants recently won the world series',
-  'java is a compiled programming language',
-  'the jets are a football team'
-]
-
-labels = ['computers','sports','computers','sports']
-
-# vectorizers take raw data and transform it to a set of features that our
-# model can understand - in this case an array of boolean values representing
-# the presence or absence of a word in text
-vectorizer = Lurn::Text::BernoulliVectorizer.new
-vectorizer.fit(documents)
-vectors = vectorizer.transform(documents)
-
-model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
-model.fit(vectors, labels)
-
-new_vectors = vectorizer.transform(['programming is fun'])
-probabilities = model.predict_probabilities(new_vectors.first)
-# => [0.9715681919147049, 0.028431808085295614]
-
-# to get the class of the maximum probability, look at the same index of the
-# unique_labels attribute on the model
-model.unique_labels[0] # => 'computers'
-```
+- Naive Bayes
+  - [Bernoulli Naive Bayes](readmes/naive_bayes/bernoulli_naive_bayes.md)
+  - [Multinomial Naive Bayes](readmes/naive_bayes/multinomial_naive_bayes.md)
+- Nearest Neighbor Models
+  - [K Nearest Neighbor Regression](readmes/neighbors/knn_regression.md)
+  - [K Nearest Neighbor Classification](readmes/neighbors/knn_classification.md)
+- Text Processing
+  - [Bernoulli Vectorizer](readmes/text_processing/bernoulli_vectorizer.md)
+- Model Evaluation
+  - [ClassifierEvaluator](readmes/evaluation/classifier_evaluator.md)
 
 ## Development
````
data/lib/lurn.rb CHANGED

````diff
@@ -1,10 +1,14 @@
 require "daru"
-require "lurn/version"
 require "lurn/text/word_tokenizer"
 require "lurn/text/bernoulli_vectorizer"
+require "lurn/text/word_count_vectorizer"
+require "lurn/naive_bayes/base"
 require "lurn/naive_bayes/bernoulli_naive_bayes"
+require "lurn/naive_bayes/multinomial_naive_bayes"
 require "lurn/evaluation/classifier_evaluator"
+require "lurn/neighbors/knn_base"
+require "lurn/neighbors/knn_regression"
+require "lurn/neighbors/knn_classifier"
 
 module Lurn
-  # Your code goes here...
 end
````
data/lib/lurn/naive_bayes/base.rb ADDED

````diff
@@ -0,0 +1,32 @@
+module Lurn
+  module NaiveBayes
+    class Base
+      def predict_probabilities(vector)
+        log_probabilties = predict_log_probabilities(vector)
+
+        log_probabilties.map { |p| Math.exp(p) }
+      end
+
+      def max_class(vector)
+        log_probs = predict_log_probabilities(vector)
+
+        max_index = log_probs.index(log_probs.max)
+
+        unique_labels[max_index]
+      end
+
+      def max_probability(vector)
+        probs = predict_probabilities(vector)
+
+        probs.max
+      end
+
+      def predict_log_probabilities(vector)
+        vector = Vector.elements(vector)
+        jll = joint_log_likelihood(vector)
+        log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))
+        jll.map{ |v| v - log_prob_x }
+      end
+    end
+  end
+end
````
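The heavy lifting in `Base` is `predict_log_probabilities`, which turns per-class joint log-likelihoods into normalized posteriors by subtracting the log of their summed exponentials. A minimal sketch of that normalization step, using made-up likelihood values (not gem output):

```ruby
# Hypothetical joint log-likelihoods for two classes.
jll = [-1.2, -3.4]

# Normalize in log space: subtract log(sum of exponentiated likelihoods),
# exactly as predict_log_probabilities does.
log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))
log_posteriors = jll.map { |v| v - log_prob_x }

# Exponentiating yields probabilities that sum to 1 (up to float error).
probabilities = log_posteriors.map { |p| Math.exp(p) }
probabilities.inject(:+) # => ~1.0
```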
data/lib/lurn/naive_bayes/bernoulli_naive_bayes.rb CHANGED

````diff
@@ -2,7 +2,7 @@ require 'matrix'
 
 module Lurn
   module NaiveBayes
-    class BernoulliNaiveBayes
+    class BernoulliNaiveBayes < Base
 
       attr_accessor :probability_matrix, :label_probabilities, :unique_labels
 
@@ -19,38 +19,7 @@ module Lurn
         document_count_matrix = build_document_count_matrix(vectors, labels)
         @probability_matrix = build_probability_matrix(document_count_matrix, labels)
 
-        @label_probabilities = @unique_labels.map { |l1| labels.
-      end
-
-      def predict_probabilities(vector)
-        log_probabilties = predict_log_probabilities(vector)
-
-        log_probabilties.map { |p| Math.exp(p) }
-      end
-
-      def predict_log_probabilities(vector)
-
-        probabilities = @unique_labels.map do |label|
-          joint_log_likelihood(vector, label)
-        end
-
-        log_prob_x = Math.log(probabilities.map { |v| Math.exp(v) }.sum)
-
-        probabilities.map { |p| p - log_prob_x }
-      end
-
-      def max_class(vector)
-        log_probs = predict_log_probabilities(vector)
-
-        max_index = log_probs.index(log_probs.max)
-
-        unique_labels[max_index]
-      end
-
-      def max_probability(vector)
-        probs = predict_probabilities(vector)
-
-        probs.max
+        @label_probabilities = @unique_labels.map { |l1| labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f }
       end
 
       def to_h
@@ -64,11 +33,11 @@ module Lurn
       private
 
       def build_probability_matrix(document_count_matrix, labels)
-        probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count
+        probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
 
         document_count_matrix.each_with_index do |value, row, col|
           label = @unique_labels[row]
-          label_frequency = labels.
+          label_frequency = labels.count(label)
 
           probability_matrix[row][col] = Math.log((value.to_f + @k) / (label_frequency.to_f + (2.0 * @k)))
         end
@@ -77,7 +46,7 @@ module Lurn
       end
 
       def build_document_count_matrix(vectors, labels)
-        matrix = Array.new(@unique_labels.count) { Array.new(@feature_count
+        matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
 
         vectors.each_with_index do |value, row, col|
           if value == true
@@ -90,16 +59,20 @@ module Lurn
         Matrix.rows(matrix)
       end
 
-      def joint_log_likelihood(
-
-
-
-
-
+      def joint_log_likelihood(x)
+        jlls = []
+
+        unique_labels.each_with_index do |label, label_index|
+          vector = Vector.elements(x.map { |e| e == true ? 1 : 0 })
+          probabilities = @probability_matrix.row(label_index)
+          neg_probs = probabilities.map { |prb| Math.log(1.0 - Math.exp(prb)) }
+          jll = vector.dot(probabilities - neg_probs)
+          jll += Math.log(@label_probabilities[label_index]) + neg_probs.inject(:+)
+
+          jlls.push jll
+        end
 
-
+        jlls
       end
 
     end
````
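The rewritten `joint_log_likelihood` leans on an algebraic shortcut: summing `x_i * log(p_i) + (1 - x_i) * log(1 - p_i)` over all features equals `x . (log p - log(1 - p)) + sum(log(1 - p))`, which is what the `vector.dot(...)` and `neg_probs.inject(:+)` lines compute. A small check of that identity with assumed probabilities (not values from the gem):

```ruby
require 'matrix'

x       = Vector[1, 0, 1]  # presence/absence flags for three words
log_p   = Vector[Math.log(0.7), Math.log(0.2), Math.log(0.5)]
log_1mp = log_p.map { |v| Math.log(1.0 - Math.exp(v)) }

# Direct sum of the per-feature Bernoulli log-likelihood terms.
direct = x.to_a.each_with_index.sum { |xi, i| xi * log_p[i] + (1 - xi) * log_1mp[i] }

# The factored form used by joint_log_likelihood.
factored = x.dot(log_p - log_1mp) + log_1mp.inject(:+)

(direct - factored).abs < 1e-12 # => true
```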
data/lib/lurn/naive_bayes/multinomial_naive_bayes.rb ADDED

````diff
@@ -0,0 +1,65 @@
+module Lurn
+  module NaiveBayes
+    class MultinomialNaiveBayes < Base
+
+      attr_accessor :prior_probabilities, :probability_matrix, :unique_labels
+
+      def initialize
+
+      end
+
+      def fit(vectors, labels)
+        vectors = Matrix.rows(vectors)
+
+        @unique_labels = labels.uniq
+        @feature_count = vectors.column_size
+        count_matrix = build_count_matrix(vectors, labels)
+        @probability_matrix = build_probability_matrix(count_matrix, labels)
+        @prior_probabilities = @unique_labels.map do |l1|
+          labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f
+        end
+      end
+
+      private
+
+      def build_probability_matrix(count_matrix, labels)
+        probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
+
+        count_matrix.each_with_index do |value, row, col|
+          label = @unique_labels[row]
+          label_frequency = labels.count(label)
+
+          numerator = (value.to_f + 1.0)
+          denominator = count_matrix.row(row).inject(:+) + @feature_count
+          probability_matrix[row][col] = Math.log(numerator / denominator)
+        end
+
+        probability_matrix
+      end
+
+      def build_count_matrix(vectors, labels)
+        matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
+
+        vectors.each_with_index do |value, row, col|
+          label = labels[row]
+          label_index = @unique_labels.index(label)
+          matrix[label_index][col] += value
+        end
+
+        Matrix.rows(matrix)
+      end
+
+      def joint_log_likelihood(vector)
+        jlls = []
+        @unique_labels.each_with_index do |label, label_index|
+          probabilities = @probability_matrix[label_index]
+          jll = vector.dot(probabilities)
+          jll += Math.log(@prior_probabilities[label_index])
+          jlls.push(jll)
+        end
+
+        jlls
+      end
+    end
+  end
+end
````
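`build_probability_matrix` applies add-one (Laplace) smoothing: each word count is incremented by one and the class's total count by the vocabulary size, so a word never seen with a class still gets a small nonzero probability. Illustrative numbers, invented for this sketch:

```ruby
count       = 3   # times the word appears in documents of this class
class_total = 20  # total word count across the class's documents
vocab_size  = 10  # number of words in the vocabulary

# Smoothed log probability: log((3 + 1) / (20 + 10)) rather than log(3 / 20),
# so an unseen word (count = 0) still gets log(1 / 30) instead of -Infinity.
Math.log((count + 1.0) / (class_total + vocab_size)) # => ~-2.015
```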
data/lib/lurn/neighbors/knn_base.rb ADDED

````diff
@@ -0,0 +1,54 @@
+module Lurn
+  module Neighbors
+    class KNNBase
+
+      attr_accessor :predictors, :targets, :k
+
+      def initialize(k)
+        @k = k
+      end
+
+      # Trains the KNN regression model to predict the target variable
+      # based on the predictors. For KNN Regression all computation is
+      # deferred until the time of prediction so in this case the data
+      # is just stored.
+      #
+      # @param predictors [Array-like] An array of arrays containing the predictor data
+      # @param targets [Array-like] An array with the value you want to predict
+      def fit(predictors, targets)
+        @predictors = predictors.map { |pred| Vector.elements(pred) }
+        @targets = targets
+
+        nil
+      end
+
+      # Returns the predictors and target value for the k nearest neighbors for the vector parameter
+      #
+      # @param vector [Array-like] An array of the same length and type as the predictors used to train the model
+      # @return [Array, Array]
+      #   Returns two values. The first is an array of the predictors for the k nearest neighbors. The second is an
+      #   array of the corresponding target values for the k nearest neighbors.
+      def nearest_neighbors(vector)
+        vector = Vector.elements(vector)
+
+        distances = @predictors.map.with_index do |p, index|
+          { index: index, distance: euclidian_distance(p, vector), value: targets[index] }
+        end
+
+        distances.sort! { |x,y| x[:distance] <=> y[:distance] }
+
+        neighboring_predictors = distances.first(@k).map { |neighbor| @predictors[neighbor[:index]] }
+        neighboring_targets = distances.first(@k).map { |neighbor| @targets[neighbor[:index]] }
+
+        return neighboring_predictors, neighboring_targets
+      end
+
+      private
+
+      def euclidian_distance(vector1, vector2)
+        Math.sqrt((vector1 - vector2).map { |v| (v.abs)**2 }.inject(:+))
+      end
+
+    end
+  end
+end
````
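A short usage sketch of `nearest_neighbors` on toy data (assuming the gem is loaded with `require 'lurn'`): with `k = 2` it returns the two stored predictors closest to the query point together with their targets, sorted nearest first.

```ruby
require 'lurn'

model = Lurn::Neighbors::KNNBase.new(2)
model.fit([[0, 0], [1, 1], [10, 10]], ['a', 'b', 'c'])

# [0, 0] and [1, 1] are nearest to the query, in that order.
neighbors, targets = model.nearest_neighbors([0.4, 0.4])
targets # => ['a', 'b']
```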
data/lib/lurn/neighbors/knn_classifier.rb ADDED

````diff
@@ -0,0 +1,23 @@
+module Lurn
+  module Neighbors
+    class KNNClassifier < KNNBase
+
+      # Predicts the class of the given observation by selecting the most common class of the
+      # closest k training observations based on euclidian distance. In the case of a tie one winner
+      # will be chosen at random from the most frequent classes.
+      #
+      # @param vector [Array-like]
+      #   An array (or array-like) of the same length as the predictors used
+      #   to fit the model
+      # @return [Object] The predicted class
+      def predict(vector)
+        _, neighboring_targets = nearest_neighbors(vector)
+
+        class_frequencies = neighboring_targets.inject(Hash.new(0)) { |h,v| h[v] += 1; h }
+
+        neighboring_targets.max_by { |v| class_frequencies[v] }
+      end
+
+    end
+  end
+end
````
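One note on the tie-breaking comment above: reading the code, `Enumerable#max_by` returns the first element that attains the maximum, and the neighbors arrive sorted by distance, so a tie in practice resolves in favor of the nearer neighbor's class rather than a random pick. A minimal illustration:

```ruby
# Two neighbors, one of each class: the frequencies tie at 1, and max_by
# keeps the first (nearest) neighbor's class.
neighboring_targets = ['engineer', 'scientist']
class_frequencies = { 'engineer' => 1, 'scientist' => 1 }

neighboring_targets.max_by { |v| class_frequencies[v] } # => 'engineer'
```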
data/lib/lurn/neighbors/knn_regression.rb ADDED

````diff
@@ -0,0 +1,20 @@
+module Lurn
+  module Neighbors
+    class KNNRegression < KNNBase
+
+      # Predicts the value of the given observation by averaging the target value of the
+      # closest k predictor observations based on euclidian distance.
+      #
+      # @param vector [Array-like]
+      #   An array (or array-like) of the same length as the predictors used
+      #   to fit the model
+      # @return [Float] The predicted value
+      def predict(vector)
+        _, neighboring_targets = nearest_neighbors(vector)
+
+        neighboring_targets.inject(:+).to_f / neighboring_targets.length.to_f
+      end
+
+    end
+  end
+end
````
data/lib/lurn/text/bernoulli_vectorizer.rb CHANGED

````diff
@@ -17,7 +17,7 @@ module Lurn
     def fit(documents)
       @vocabulary = []
       tokenized_docs = tokenize_documents(documents)
-      @vocabulary = tokenized_docs.flatten.uniq.sort
+      @vocabulary = tokenized_docs.flatten(1).uniq.sort
       reduce_features(tokenized_docs)
     end
 
@@ -49,12 +49,9 @@ module Lurn
         end
       end
 
-      reduced_features =
-      @vocabulary.each_with_index do |token, index|
+      reduced_features = @vocabulary.select.with_index do |token, index|
         freq = doc_frequencies[index]
-
-        reduced_features.push token
-      end
+        @options[:min_df] < freq && freq < @options[:max_df]
       end
 
       @vocabulary = reduced_features
````
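The switch from `flatten` to `flatten(1)` matters once the tokenizer can emit ngrams, because each ngram token is itself an array: a full flatten would dissolve the ngrams into single words, while `flatten(1)` only unnests one level and keeps them intact as vocabulary entries. For example:

```ruby
# Two documents tokenized with ngrams: 2; each token is a word pair.
tokenized_docs = [[['hello', 'world']], [['hello', 'fred']]]

tokenized_docs.flatten.uniq.sort
# => ["fred", "hello", "world"]               (ngrams destroyed)

tokenized_docs.flatten(1).uniq.sort
# => [["hello", "fred"], ["hello", "world"]]  (ngrams preserved)
```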
data/lib/lurn/text/word_count_vectorizer.rb ADDED

````diff
@@ -0,0 +1,65 @@
+module Lurn
+  module Text
+    class WordCountVectorizer
+
+      attr_accessor :tokenizer
+      attr_accessor :vocabulary
+
+      def initialize(options = {})
+        @tokenizer = options[:tokenizer] || WordTokenizer.new
+        @vocabulary = []
+
+        options[:max_df] ||= 50
+        options[:min_df] ||= 0
+        @options = options
+      end
+
+      def fit(documents)
+        @vocabulary = []
+        tokenized_docs = tokenize_documents(documents)
+        @vocabulary = tokenized_docs.flatten(1).uniq.sort
+        reduce_features(tokenized_docs)
+      end
+
+      def to_h
+        {
+          tokenizer_options: @tokenizer.to_h,
+          vocabulary: @vocabulary
+        }
+      end
+
+      def transform(documents)
+        documents.map do |document|
+          tokens = @tokenizer.tokenize(document)
+          @vocabulary.map do |word|
+            tokens.count word
+          end
+        end
+      end
+
+      private
+
+      def reduce_features(tokenized_docs)
+        doc_frequencies = Array.new(@vocabulary.length, 0)
+
+        tokenized_docs.each do |tokens|
+          tokens.each do |token|
+            vocab_index = @vocabulary.index(token)
+            doc_frequencies[vocab_index] += 1
+          end
+        end
+
+        reduced_features = @vocabulary.select.with_index do |token, index|
+          freq = doc_frequencies[index]
+          @options[:min_df] < freq && freq < @options[:max_df]
+        end
+
+        @vocabulary = reduced_features
+      end
+
+      def tokenize_documents(documents)
+        documents.map { |doc| @tokenizer.tokenize(doc).uniq }
+      end
+    end
+  end
+end
````
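Unlike the Bernoulli vectorizer, `WordCountVectorizer#transform` produces integer counts rather than booleans. A quick sketch of what that should look like under the default options (the output is written out by hand, so treat it as illustrative):

```ruby
require 'lurn'

docs = ['ruby ruby is great', 'java is compiled']

vectorizer = Lurn::Text::WordCountVectorizer.new
vectorizer.fit(docs)

# The fitted vocabulary is sorted: ['compiled', 'great', 'is', 'java', 'ruby'],
# so 'ruby' appearing twice in the first document yields a 2 in that column.
vectorizer.transform(docs)
# => [[0, 1, 1, 0, 2], [1, 0, 1, 1, 0]]
```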
data/lib/lurn/text/word_tokenizer.rb CHANGED

````diff
@@ -21,17 +21,22 @@ module Lurn
       @options[:strip_punctuation] ||= false
       @options[:strip_stopwords] ||= false
       @options[:stem_words] ||= false
+      @options[:ngrams] ||= 1
     end
 
     def tokenize(document)
-      document = document.gsub(/[[:punct:]]/,
-      document = document.
+      document = document.gsub(/[[:punct:]]/, "") if @options[:strip_punctuation] == true
+      document = document.split("\s")
 
       if(@options[:stem_words])
         stemmer = Lingua::Stemmer.new(language: :en)
         document = document.map { |word| stemmer.stem(word) }
       end
 
+      if(@options[:ngrams] > 1)
+        document = document.each_cons(@options[:ngrams]).to_a
+      end
+
       document
     end
````
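With the new `ngrams` option, tokens become arrays of consecutive words instead of single strings. A small sketch of the expected behavior, assuming `WordTokenizer.new` accepts an options hash as the `@options` defaults above imply:

```ruby
require 'lurn'

tokenizer = Lurn::Text::WordTokenizer.new(ngrams: 2)

# each_cons(2) slides a window of two words across the token stream.
tokenizer.tokenize('the giants won')
# => [["the", "giants"], ["giants", "won"]]
```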
data/lurn.gemspec CHANGED

````diff
@@ -1,11 +1,7 @@
 # coding: utf-8
-lib = File.expand_path('../lib', __FILE__)
-$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require 'lurn/version'
-
 Gem::Specification.new do |spec|
   spec.name          = "lurn"
-  spec.version       =
+  spec.version       = "0.1.2"
   spec.authors       = ["daniel.carpenter"]
   spec.email         = ["daniel.carpenter01@gmail.com"]
 
@@ -21,13 +17,15 @@ Gem::Specification.new do |spec|
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
 
-  spec.add_dependency "terminal-table", "~> 1.
+  spec.add_dependency "terminal-table", "~> 1.8.0", '>= 1.8.0'
   spec.add_dependency "ruby-stemmer", "~> 0.9.6"
-  spec.add_dependency "daru",
+  spec.add_dependency "daru", "~> 0.2.1"
 
   spec.add_development_dependency "bundler", "~> 1.13"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec", "~> 3.0"
-  spec.add_development_dependency "awesome_print"
-  spec.add_development_dependency "byebug"
+  spec.add_development_dependency "awesome_print", "~> 0"
+  spec.add_development_dependency "byebug", "~> 10.0", ">= 10.0.2"
+  spec.add_development_dependency "rspec_junit_formatter", "~> 0.4", ">= 0.4.1"
+  spec.add_development_dependency "yard", "~> 0.9.9"
 end
````
data/readmes/evaluation/classifier_evaluator.md ADDED

````diff
@@ -0,0 +1,21 @@
+# Classifier Evaluator
+`Lurn::Evaluation::ClassifierEvaluator` provides some basic functionality for evaluating the performance of a classifier.
+
+## Example
+```
+actual_class = ['sports','science','science','sports']
+predicted_class = ['sports','sports','science','sports']
+
+eval = Lurn::Evaluation::ClassifierEvaluator.new predicted_class, actual_class
+
+print eval.summary
+
+# output
++-----------------+--------------------+--------+
+| Class           | Precision          | Recall |
++-----------------+--------------------+--------+
+| sports          | 0.6666666666666666 | 1.0    |
+| science         | 1.0                | 0.5    |
+| Overall Average | 0.8333333333333333 | 0.75   |
++-----------------+--------------------+--------+
+```
````
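The numbers in that summary follow directly from the definitions of precision and recall. Working through the 'sports' row by hand:

```ruby
# 'sports' was predicted 3 times and was correct twice: precision = 2/3.
# Both actual 'sports' examples were recovered: recall = 2/2.
precision = 2.0 / 3 # => 0.6666666666666666
recall    = 2.0 / 2 # => 1.0
```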
data/readmes/naive_bayes/bernoulli_naive_bayes.md ADDED

````diff
@@ -0,0 +1,41 @@
+### Bernoulli Naive Bayes
+Naive Bayes is a Bayesian model often used for text classification. Bernoulli Naive Bayes specifically classifies observations based on the presence or absence of a feature in an observation.
+
+Below is a simple text classification using Naive Bayes in Lurn.
+
+1. Start with some text documents
+
+```ruby
+documents = [
+  'ruby is a great programming language',
+  'the giants recently won the world series',
+  'java is a compiled programming language',
+  'the jets are a football team'
+]
+
+labels = ['computers','sports','computers','sports']
+```
+
+2. Convert them to arrays of booleans representing which words they contain (or don't contain). Lurn provides vectorizers for this purpose.
+```
+vectorizer = Lurn::Text::BernoulliVectorizer.new
+vectorizer.fit(documents)
+vectors = vectorizer.transform(documents)
+```
+
+3. Initialize and train the model
+```
+model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
+model.fit(vectors, labels)
+```
+
+4. Classify a new document
+```
+new_vectors = vectorizer.transform(['programming is fun'])
+
+# get the most probable class for the new document given the training data
+model.max_class(new_vectors.first)
+
+# get the probability score for the most probable class
+model.max_probability(new_vectors.first)
+```
````
data/readmes/naive_bayes/multinomial_naive_bayes.md ADDED

````diff
@@ -0,0 +1,41 @@
+### Multinomial Naive Bayes
+Naive Bayes is a Bayesian model often used for text classification. Multinomial Naive Bayes specifically classifies observations based on features with a multinomial distribution, such as word counts.
+
+Below is a simple text classification using Multinomial Naive Bayes in Lurn.
+
+1. Start with some text documents
+
+```ruby
+documents = [
+  'ruby is a great programming language',
+  'the giants recently won the world series',
+  'java is a compiled programming language',
+  'the jets are a football team'
+]
+
+labels = ['computers','sports','computers','sports']
+```
+
+2. Convert them to arrays of word counts. Lurn provides vectorizers for this purpose.
+```
+vectorizer = Lurn::Text::WordCountVectorizer.new
+vectorizer.fit(documents)
+vectors = vectorizer.transform(documents)
+```
+
+3. Initialize and train the model
+```
+model = Lurn::NaiveBayes::MultinomialNaiveBayes.new
+model.fit(vectors, labels)
+```
+
+4. Classify a new document
+```
+new_vectors = vectorizer.transform(['programming is fun'])
+
+# get the most probable class for the new document given the training data
+model.max_class(new_vectors.first)
+
+# get the probability score for the most probable class
+model.max_probability(new_vectors.first)
+```
````
data/readmes/neighbors/knn_classification.md ADDED

````diff
@@ -0,0 +1,48 @@
+### K Nearest Neighbor Classifier
+K Nearest Neighbor (KNN) Classification is one of the simplest forms of classification
+in the machine learning toolbox. Training data is stored on the model and all
+computation is deferred until the time of prediction. When a new observation
+is provided it calculates the distance between the new observation and all
+training data in an n-dimensional space (where n is the number of variables).
+The predicted class is the most common class among the k closest training records.
+
+Below is a simple example of using KNN Classification in Lurn.
+
+Suppose we have a dataset containing the income, years of college education and job title
+for a set of people. We could use this as training data to predict
+people's job title based on their income and years of education.
+
+```ruby
+people = [
+  # years of education   annual income   job title
+  [ 4,                   50000,          'engineer'],
+  [ 6,                   60000,          'scientist'],
+  [ 2,                   40000,          'engineer'],
+  [ 8,                   90000,          'scientist'],
+  [ 4,                   70000,          'librarian'],
+]
+
+# extract education and income
+predictors = people.map { |person| person[0..1] }
+
+# extract job title
+target_var = people.map { |person| person[2] }
+```
+
+The model can be trained by passing the predictors and target values to an initialized
+instance of the KNNClassifier model.
+
+```ruby
+# initialize the model with a k of 2
+model = Lurn::Neighbors::KNNClassifier.new(2)
+
+model.fit(predictors, target_var)
+```
+
+The model can now be used to predict the job title of a person given his/her
+years of education and income.
+
+```ruby
+# predict the job title of a person with 4 years of education who makes $45,000
+model.predict([4, 45000]) # => engineer
+```
````
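Working through the arithmetic behind that prediction (a hand calculation reusing the `people` array above): because the features are unscaled, the income column dominates the Euclidean distance, and the two nearest rows to `[4, 45000]` are both engineers, so 'engineer' wins the vote.

```ruby
query = [4, 45000]

people.map { |education, income, title|
  distance = Math.sqrt((education - query[0])**2 + (income - query[1])**2)
  [title, distance.round(4)]
}.sort_by(&:last).first(2)
# => [["engineer", 5000.0], ["engineer", 5000.0004]]
```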
data/readmes/neighbors/knn_regression.md ADDED

````diff
@@ -0,0 +1,48 @@
+### K Nearest Neighbor Regression
+K Nearest Neighbor (KNN) Regression is one of the simplest forms of regression
+in the machine learning toolbox. Training data is stored on the model and all
+computation is deferred until the time of prediction. When a new observation
+is provided it calculates the distance between the new observation and all
+training data in an n-dimensional space (where n is the number of variables).
+The predicted value is the average value of the k closest training records.
+
+Below is a simple example of using KNN Regression in Lurn.
+
+Suppose we have a dataset containing the age, years of college education and annual
+income for a set of people. We could use this as training data to predict
+people's annual income based on their age and years of education.
+
+```ruby
+people = [
+  # age   years of education   annual income
+  [ 25,   4,                   50000],
+  [ 35,   6,                   60000],
+  [ 51,   2,                   40000],
+  [ 45,   8,                   90000],
+  [ 32,   4,                   70000],
+]
+
+# extract age and education
+predictors = people.map { |person| person[0..1] }
+
+# extract annual income
+target_var = people.map { |person| person[2] }
+```
+
+The model can be trained by passing the predictors and target values to an initialized
+instance of the KNNRegression model.
+
+```ruby
+# initialize the model with a k of 2
+model = Lurn::Neighbors::KNNRegression.new(2)
+
+model.fit(predictors, target_var)
+```
+
+The model can now be used to predict the income of a person given his/her
+age and years of education.
+
+```ruby
+# predict the income of a 31 year old person with 4 years of education
+model.predict([31, 4])
+```
````
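That call can be worked out by hand: the two training rows nearest to `[31, 4]` are `[32, 4]` (income 70000, distance 1.0) and `[35, 6]` (income 60000, distance of roughly 4.47), so with `k = 2` the prediction is their average.

```ruby
# Average of the two nearest neighbors' incomes.
(70000 + 60000) / 2.0 # => 65000.0
```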
data/readmes/text_processing/bernoulli_vectorizer.md ADDED

````diff
@@ -0,0 +1,30 @@
+# Bernoulli Vectorizer (word presence vectorizer)
+
+A Bernoulli document model is one that represents a piece of text as an array of boolean values. Each boolean represents the presence (true) or absence (false) of a word in the document.
+
+`Lurn::Text::BernoulliVectorizer` is intended to make it easy to convert text into Bernoulli vectors.
+
+## Basic example
+```
+docs = ['hello world', 'hello fred']
+
+vectorizer = Lurn::Text::BernoulliVectorizer.new
+
+# vectorizers must be trained in order to know
+# what features (words) exist in the data set
+vectorizer.fit(docs)
+
+vectorizer.transform(docs)
+```
+
+## Configuration
+`BernoulliVectorizer.new` accepts a number of options for configuring how documents are vectorized. A few include:
+- max_df [int]: Excludes words which appear in more than `max_df` documents
+- min_df [int]: Excludes words which appear in fewer than `min_df` documents
+- strip_stopwords [boolean]: Removes stop words if true
+- stem_words [boolean]: Stems words in the documents if true
+- ngrams [int]: Features will be determined based on groupings of `ngrams` consecutive words instead of individual words
+
+```
+Lurn::Text::BernoulliVectorizer.new(strip_stopwords: true, min_df: 10)
+```
````
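One caveat when tuning `min_df` and `max_df`: `reduce_features` uses strict comparisons (`min_df < freq && freq < max_df`), so a word must appear in strictly more than `min_df` documents to survive. A sketch of the effect, assuming the vectorizer exposes its `vocabulary` the way `WordCountVectorizer` does:

```ruby
docs = ['hello world', 'hello fred']

# 'hello' has document frequency 2; 'world' and 'fred' have frequency 1,
# so with min_df: 1 only 'hello' clears the strict threshold.
vectorizer = Lurn::Text::BernoulliVectorizer.new(min_df: 1)
vectorizer.fit(docs)

vectorizer.vocabulary # => ['hello']
```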
metadata CHANGED

````diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: lurn
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - daniel.carpenter
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2018-08-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: terminal-table
@@ -16,14 +16,20 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.8.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.8.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.0
 - !ruby/object:Gem::Dependency
   name: ruby-stemmer
   requirement: !ruby/object:Gem::Requirement
@@ -44,14 +50,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1
+        version: 0.2.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
    - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1
+        version: 0.2.1
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -98,30 +104,70 @@ dependencies:
   name: awesome_print
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: byebug
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
     - - ">="
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 10.0.2
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
     - - ">="
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 10.0.2
+- !ruby/object:Gem::Dependency
+  name: rspec_junit_formatter
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.1
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.9
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.9
 description: " A gem with tools for machine learning. "
 email:
 - daniel.carpenter01@gmail.com
@@ -129,6 +175,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".rspec"
 - ".travis.yml"
@@ -140,11 +187,22 @@ files:
 - bin/setup
 - lib/lurn.rb
 - lib/lurn/evaluation/classifier_evaluator.rb
+- lib/lurn/naive_bayes/base.rb
 - lib/lurn/naive_bayes/bernoulli_naive_bayes.rb
+- lib/lurn/naive_bayes/multinomial_naive_bayes.rb
+- lib/lurn/neighbors/knn_base.rb
+- lib/lurn/neighbors/knn_classifier.rb
+- lib/lurn/neighbors/knn_regression.rb
 - lib/lurn/text/bernoulli_vectorizer.rb
+- lib/lurn/text/word_count_vectorizer.rb
 - lib/lurn/text/word_tokenizer.rb
-- lib/lurn/version.rb
 - lurn.gemspec
+- readmes/evaluation/classifier_evaluator.md
+- readmes/naive_bayes/bernoulli_naive_bayes.md
+- readmes/naive_bayes/multinomial_naive_bayes.md
+- readmes/neighbors/knn_classification.md
+- readmes/neighbors/knn_regression.md
+- readmes/text_processing/bernoulli_vectorizer.md
 homepage: https://www.github.com/dansbits/lurn
 licenses:
 - MIT
````
data/lib/lurn/version.rb DELETED