lurn 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +46 -0
- data/README.md +11 -32
- data/lib/lurn.rb +6 -2
- data/lib/lurn/naive_bayes/base.rb +32 -0
- data/lib/lurn/naive_bayes/bernoulli_naive_bayes.rb +17 -44
- data/lib/lurn/naive_bayes/multinomial_naive_bayes.rb +65 -0
- data/lib/lurn/neighbors/knn_base.rb +54 -0
- data/lib/lurn/neighbors/knn_classifier.rb +23 -0
- data/lib/lurn/neighbors/knn_regression.rb +20 -0
- data/lib/lurn/text/bernoulli_vectorizer.rb +3 -6
- data/lib/lurn/text/word_count_vectorizer.rb +65 -0
- data/lib/lurn/text/word_tokenizer.rb +7 -2
- data/lurn.gemspec +7 -9
- data/readmes/evaluation/classifier_evaluator.md +21 -0
- data/readmes/naive_bayes/bernoulli_naive_bayes.md +41 -0
- data/readmes/naive_bayes/multinomial_naive_bayes.md +41 -0
- data/readmes/neighbors/knn_classification.md +48 -0
- data/readmes/neighbors/knn_regression.md +48 -0
- data/readmes/text_processing/bernoulli_vectorizer.md +30 -0
- metadata +69 -11
- data/lib/lurn/version.rb +0 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 74be6a1bd3e76e61d34048367f8fafed76c39a46
+  data.tar.gz: 355ea667da4dd95845d00d8ebebc40636d740cab
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9dad1d2540818efd226bb029aca818b19f5995a3bfbe77392e2577236038f7c84237cb4b19203e0101da1830338d135ea321e27cbecd32664851994c5a100035
+  data.tar.gz: 4defd5fc70dcfbd3389ab184cf59764cf734bc8bda9347070e0c448cb7286ca7d1cd17ca926785191501ca030a4f8f3c74639fa4271a3f6d13cabe88d2a012d0

data/.circleci/config.yml
ADDED
@@ -0,0 +1,46 @@
+# Ruby CircleCI 2.0 configuration file
+#
+# Check https://circleci.com/docs/2.0/language-ruby/ for more details
+#
+version: 2
+jobs:
+  build:
+    docker:
+      # specify the version you desire here
+      - image: circleci/ruby:2.4
+
+      # Specify service dependencies here if necessary
+      # CircleCI maintains a library of pre-built images
+      # documented at https://circleci.com/docs/2.0/circleci-images/
+      # - image: circleci/postgres:9.4
+
+    working_directory: ~/repo
+
+    steps:
+      - checkout
+
+      - run:
+          name: install dependencies
+          command: |
+            bundle install --jobs=4 --retry=3 --path vendor/bundle
+
+      # run tests!
+      - run:
+          name: run tests
+          command: |
+            mkdir /tmp/test-results
+            TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
+
+            bundle exec rspec --format progress \
+                              --format RspecJunitFormatter \
+                              --out /tmp/test-results/rspec.xml \
+                              --format progress \
+                              -- \
+                              $TEST_FILES
+
+      # collect reports
+      - store_test_results:
+          path: /tmp/test-results
+      - store_artifacts:
+          path: /tmp/test-results
+          destination: test-results

data/README.md
CHANGED
@@ -1,6 +1,6 @@
 # Lurn
 
-Lurn is a ruby gem for performing machine learning. The API and design patterns in Lurn are inspired by
+Lurn is a ruby gem for performing machine learning tasks. The API and design patterns in Lurn are inspired by scikit-learn, a popular machine learning library for Python.
 
 ## Installation
 
@@ -20,37 +20,16 @@ Or install it yourself as:
 
 ## Usage
 
-
-
-
-
-
-
-
-
-
-]
-
-labels = ['computers','sports','computers','sports']
-
-# vectorizers take raw data and transform it to a set of features that our
-# model can understand - in this case an array of boolean values representing
-# the presence or absence of a word in text
-vectorizer = Lurn::Text::BernoulliVectorizer.new
-vectorizer.fit(documents)
-vectors = vectorizer.transform(documents)
-
-model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
-model.fit(vectors, labels)
-
-new_vectors = vectorizer.transform(['programming is fun'])
-probabilities = model.predict_probabilities(new_vectors.first)
-# => [0.9715681919147049, 0.028431808085295614]
-
-# to get the class of the maximum probability, look at the same index of the
-# unique_labels attribute on the model
-model.unique_labels[0] # => 'computers'
-```
+- Naive Bayes
+  - [Bernoulli Naive Bayes](readmes/naive_bayes/bernoulli_naive_bayes.md)
+  - [Multinomial Naive Bayes](readmes/naive_bayes/multinomial_naive_bayes.md)
+- Nearest Neighbor Models
+  - [K Nearest Neighbor Regression](readmes/neighbors/knn_regression.md)
+  - [K Nearest Neighbor Classification](readmes/neighbors/knn_classification.md)
+- Text Processing
+  - [Bernoulli Vectorizer](readmes/text_processing/bernoulli_vectorizer.md)
+- Model Evaluation
+  - [ClassifierEvaluator](readmes/evaluation/classifier_evaluator.md)
 
 ## Development
 

data/lib/lurn.rb
CHANGED
@@ -1,10 +1,14 @@
 require "daru"
-require "lurn/version"
 require "lurn/text/word_tokenizer"
 require "lurn/text/bernoulli_vectorizer"
+require "lurn/text/word_count_vectorizer"
+require "lurn/naive_bayes/base"
 require "lurn/naive_bayes/bernoulli_naive_bayes"
+require "lurn/naive_bayes/multinomial_naive_bayes"
 require "lurn/evaluation/classifier_evaluator"
+require "lurn/neighbors/knn_base"
+require "lurn/neighbors/knn_regression"
+require "lurn/neighbors/knn_classifier"
 
 module Lurn
-  # Your code goes here...
 end

data/lib/lurn/naive_bayes/base.rb
ADDED
@@ -0,0 +1,32 @@
+module Lurn
+  module NaiveBayes
+    class Base
+      def predict_probabilities(vector)
+        log_probabilties = predict_log_probabilities(vector)
+
+        log_probabilties.map { |p| Math.exp(p) }
+      end
+
+      def max_class(vector)
+        log_probs = predict_log_probabilities(vector)
+
+        max_index = log_probs.index(log_probs.max)
+
+        unique_labels[max_index]
+      end
+
+      def max_probability(vector)
+        probs = predict_probabilities(vector)
+
+        probs.max
+      end
+
+      def predict_log_probabilities(vector)
+        vector = Vector.elements(vector)
+        jll = joint_log_likelihood(vector)
+        log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))
+        jll.map{ |v| v - log_prob_x }
+      end
+    end
+  end
+end

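The new `Base` class factors the shared prediction math out of the individual classifiers: `predict_log_probabilities` normalizes whatever `joint_log_likelihood` returns, and the other methods derive from it, so a subclass only needs to supply `joint_log_likelihood` and `unique_labels`. A minimal sketch of that contract (the toy subclass and its fixed likelihoods below are invented for illustration and are not part of this release):

```ruby
require 'matrix'
require 'lurn'

# Hypothetical subclass: returns fixed joint log likelihoods
# instead of learning them from data.
class ToyModel < Lurn::NaiveBayes::Base
  def unique_labels
    ['computers', 'sports']
  end

  def joint_log_likelihood(_vector)
    # unnormalized log p(x, label) for each label
    [Math.log(0.03), Math.log(0.01)]
  end
end

model = ToyModel.new
model.predict_probabilities([true, false]) # => [0.75, 0.25]
model.max_class([true, false])             # => "computers"
```
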
data/lib/lurn/naive_bayes/bernoulli_naive_bayes.rb
CHANGED
@@ -2,7 +2,7 @@ require 'matrix'
 
 module Lurn
   module NaiveBayes
-    class BernoulliNaiveBayes
+    class BernoulliNaiveBayes < Base
 
       attr_accessor :probability_matrix, :label_probabilities, :unique_labels
 
@@ -19,38 +19,7 @@ module Lurn
       document_count_matrix = build_document_count_matrix(vectors, labels)
       @probability_matrix = build_probability_matrix(document_count_matrix, labels)
 
-      @label_probabilities = @unique_labels.map { |l1| labels.
-    end
-
-    def predict_probabilities(vector)
-      log_probabilties = predict_log_probabilities(vector)
-
-      log_probabilties.map { |p| Math.exp(p) }
-    end
-
-    def predict_log_probabilities(vector)
-
-      probabilities = @unique_labels.map do |label|
-        joint_log_likelihood(vector, label)
-      end
-
-      log_prob_x = Math.log(probabilities.map { |v| Math.exp(v) }.sum)
-
-      probabilities.map { |p| p - log_prob_x }
-    end
-
-    def max_class(vector)
-      log_probs = predict_log_probabilities(vector)
-
-      max_index = log_probs.index(log_probs.max)
-
-      unique_labels[max_index]
-    end
-
-    def max_probability(vector)
-      probs = predict_probabilities(vector)
-
-      probs.max
+      @label_probabilities = @unique_labels.map { |l1| labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f }
     end
 
     def to_h
@@ -64,11 +33,11 @@ module Lurn
     private
 
     def build_probability_matrix(document_count_matrix, labels)
-      probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count
+      probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
 
       document_count_matrix.each_with_index do |value, row, col|
        label = @unique_labels[row]
-        label_frequency = labels.
+        label_frequency = labels.count(label)
 
        probability_matrix[row][col] = Math.log((value.to_f + @k) / (label_frequency.to_f + (2.0 * @k)))
      end
@@ -77,7 +46,7 @@ module Lurn
     end
 
     def build_document_count_matrix(vectors, labels)
-      matrix = Array.new(@unique_labels.count) { Array.new(@feature_count
+      matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
 
       vectors.each_with_index do |value, row, col|
        if value == true
@@ -90,16 +59,20 @@ module Lurn
       Matrix.rows(matrix)
     end
 
-    def joint_log_likelihood(
-
+    def joint_log_likelihood(x)
+      jlls = []
 
-
-
-
-
-
+      unique_labels.each_with_index do |label, label_index|
+        vector = Vector.elements(x.map { |e| e == true ? 1 : 0 })
+        probabilities = @probability_matrix.row(label_index)
+        neg_probs = probabilities.map { |prb| Math.log(1.0 - Math.exp(prb)) }
+        jll = vector.dot(probabilities - neg_probs)
+        jll += Math.log(@label_probabilities[label_index]) + neg_probs.inject(:+)
+
+        jlls.push jll
+      end
 
-
+      jlls
     end
 
   end

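`build_probability_matrix` stores smoothed log conditionals: with smoothing constant `@k`, a word seen in `value` of a label's `label_frequency` documents gets `log((value + k) / (label_frequency + 2k))`, so an unseen word contributes a small finite log probability instead of negative infinity. A quick standalone check of that arithmetic (illustrative numbers, not taken from the gem):

```ruby
k = 1.0                # smoothing constant (@k in the class; value assumed here)
value = 2.0            # documents of this label containing the word
label_frequency = 3.0  # total documents with this label

Math.log((value + k) / (label_frequency + (2.0 * k)))
# => log(3/5) ≈ -0.51; with value = 0 it is log(1/5), never -Infinity
```
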
data/lib/lurn/naive_bayes/multinomial_naive_bayes.rb
ADDED
@@ -0,0 +1,65 @@
+module Lurn
+  module NaiveBayes
+    class MultinomialNaiveBayes < Base
+
+      attr_accessor :prior_probabilities, :probability_matrix, :unique_labels
+
+      def initialize
+
+      end
+
+      def fit(vectors, labels)
+        vectors = Matrix.rows(vectors)
+
+        @unique_labels = labels.uniq
+        @feature_count = vectors.column_size
+        count_matrix = build_count_matrix(vectors, labels)
+        @probability_matrix = build_probability_matrix(count_matrix, labels)
+        @prior_probabilities = @unique_labels.map do |l1|
+          labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f
+        end
+      end
+
+      private
+
+      def build_probability_matrix(count_matrix, labels)
+        probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
+
+        count_matrix.each_with_index do |value, row, col|
+          label = @unique_labels[row]
+          label_frequency = labels.count(label)
+
+          numerator = (value.to_f + 1.0)
+          denominator = count_matrix.row(row).inject(:+) + @feature_count
+          probability_matrix[row][col] = Math.log(numerator / denominator)
+        end
+
+        probability_matrix
+      end
+
+      def build_count_matrix(vectors, labels)
+        matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
+
+        vectors.each_with_index do |value, row, col|
+          label = labels[row]
+          label_index = @unique_labels.index(label)
+          matrix[label_index][col] += value
+        end
+
+        Matrix.rows(matrix)
+      end
+
+      def joint_log_likelihood(vector)
+        jlls = []
+        @unique_labels.each_with_index do |label, label_index|
+          probabilities = @probability_matrix[label_index]
+          jll = vector.dot(probabilities)
+          jll += Math.log(@prior_probabilities[label_index])
+          jlls.push(jll)
+        end
+
+        jlls
+      end
+    end
+  end
+end

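`MultinomialNaiveBayes` applies add-one smoothing over word counts: each conditional is `(count + 1) / (label_total + vocabulary_size)`, and `joint_log_likelihood` adds the log prior. A self-contained fitting-and-scoring sketch with toy count vectors (data invented for illustration; assumes the classes from this release are loaded via `require 'lurn'`):

```ruby
require 'lurn'

# Toy word-count vectors over a 3-word vocabulary (invented data)
vectors = [
  [2, 0, 1], # label 'computers'
  [0, 3, 0], # label 'sports'
]
labels = ['computers', 'sports']

model = Lurn::NaiveBayes::MultinomialNaiveBayes.new
model.fit(vectors, labels)

# p(word|computers) = (2+1)/(3+3), (0+1)/(3+3), (1+1)/(3+3)
model.max_class([1, 0, 1])       # => "computers"
model.max_probability([1, 0, 1]) # ≈ 0.86
```
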
data/lib/lurn/neighbors/knn_base.rb
ADDED
@@ -0,0 +1,54 @@
+module Lurn
+  module Neighbors
+    class KNNBase
+
+      attr_accessor :predictors, :targets, :k
+
+      def initialize(k)
+        @k = k
+      end
+
+      # Trains the KNN regression model to predict the target variable
+      # based on the predictors. For KNN Regression all computation is
+      # deferred until the time of prediction so in this case the data
+      # is just stored.
+      #
+      # @param predictors [Array-like] An array of arrays containing the predictor data
+      # @param targets [Array-like] An array with the value you want to predict
+      def fit(predictors, targets)
+        @predictors = predictors.map { |pred| Vector.elements(pred) }
+        @targets = targets
+
+        nil
+      end
+
+      # Returns the predictors and target value for the k nearest neighbors for the vector parameter
+      #
+      # @param vector [Array-like] An array of the same length and type as the predictors used to train the model
+      # @return [Array, Array]
+      #   Returns two values. The first is an array of the predictors for the k nearest neighbors. The second is an
+      #   array of the corresponding target values for the k nearest neighbors.
+      def nearest_neighbors(vector)
+        vector = Vector.elements(vector)
+
+        distances = @predictors.map.with_index do |p, index|
+          { index: index, distance: euclidian_distance(p, vector), value: targets[index] }
+        end
+
+        distances.sort! { |x,y| x[:distance] <=> y[:distance] }
+
+        neighboring_predictors = distances.first(@k).map { |neighbor| @predictors[neighbor[:index]] }
+        neighboring_targets = distances.first(@k).map { |neighbor| @targets[neighbor[:index]] }
+
+        return neighboring_predictors, neighboring_targets
+      end
+
+      private
+
+      def euclidian_distance(vector1, vector2)
+        Math.sqrt((vector1 - vector2).map { |v| (v.abs)**2 }.inject(:+))
+      end
+
+    end
+  end
+end

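`nearest_neighbors` is the core both KNN models share: compute the Euclidean distance from the query to every stored predictor, sort, and return the k closest predictors and their targets. A small sketch of calling it directly (toy data, illustration only; assumes `require 'lurn'` loads this release):

```ruby
require 'lurn'

knn = Lurn::Neighbors::KNNBase.new(2)
knn.fit([[0, 0], [1, 1], [10, 10]], ['a', 'b', 'c'])

predictors, targets = knn.nearest_neighbors([0.4, 0.9])
targets # => ["b", "a"] -- the two closest training rows, nearest first
```
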
data/lib/lurn/neighbors/knn_classifier.rb
ADDED
@@ -0,0 +1,23 @@
+module Lurn
+  module Neighbors
+    class KNNClassifier < KNNBase
+
+      # Predicts the class of the given observation by selecting the most common class of the
+      # closest k training observations based on euclidian distance. In the case of a tie one winner
+      # will be chosen at random from the most frequent classes.
+      #
+      # @param vector [Array-like]
+      #   An array (or array-like) of the same length as the predictors used
+      #   to fit the model
+      # @return [Object] The predicted class
+      def predict(vector)
+        _, neighboring_targets = nearest_neighbors(vector)
+
+        class_frequencies = neighboring_targets.inject(Hash.new(0)) { |h,v| h[v] += 1; h }
+
+        neighboring_targets.max_by { |v| class_frequencies[v] }
+      end
+
+    end
+  end
+end

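One nuance worth noting: the comment says ties are broken at random, but `max_by` in MRI keeps the first element that attains the maximum, so among tied classes the one whose member appears earliest in distance order (the closer neighbor) wins deterministically. A toy prediction (invented data, illustration only):

```ruby
require 'lurn'

model = Lurn::Neighbors::KNNClassifier.new(3)
model.fit([[1], [2], [3], [10]], ['x', 'x', 'y', 'y'])

# 3 nearest to 2.6 are 3 ('y'), 2 ('x'), 1 ('x') -> 'x' wins 2 votes to 1
model.predict([2.6]) # => "x"
```
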
data/lib/lurn/neighbors/knn_regression.rb
ADDED
@@ -0,0 +1,20 @@
+module Lurn
+  module Neighbors
+    class KNNRegression < KNNBase
+
+      # Predicts the value of the given observation by averaging the target value of the
+      # closest k predictor observations based on euclidian distance.
+      #
+      # @param vector [Array-like]
+      #   An array (or array-like) of the same length as the predictors used
+      #   to fit the model
+      # @return [Float] The predicted value
+      def predict(vector)
+        _, neighboring_targets = nearest_neighbors(vector)
+
+        neighboring_targets.inject(:+).to_f / neighboring_targets.length.to_f
+      end
+
+    end
+  end
+end

data/lib/lurn/text/bernoulli_vectorizer.rb
CHANGED
@@ -17,7 +17,7 @@ module Lurn
     def fit(documents)
       @vocabulary = []
       tokenized_docs = tokenize_documents(documents)
-      @vocabulary = tokenized_docs.flatten.uniq.sort
+      @vocabulary = tokenized_docs.flatten(1).uniq.sort
       reduce_features(tokenized_docs)
     end
 
@@ -49,12 +49,9 @@ module Lurn
         end
       end
 
-      reduced_features =
-      @vocabulary.each_with_index do |token, index|
+      reduced_features = @vocabulary.select.with_index do |token, index|
         freq = doc_frequencies[index]
-
-        reduced_features.push token
-      end
+        @options[:min_df] < freq && freq < @options[:max_df]
       end
 
       @vocabulary = reduced_features

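Two behavior changes here: `flatten(1)` keeps ngram tokens (arrays of words) intact as single vocabulary entries instead of dissolving them into individual words, and the rewritten `reduce_features` actually applies the `min_df`/`max_df` bounds, where the removed loop computed `freq` but, as far as the visible lines show, pushed every token. The filtering logic in isolation (standalone sketch with illustrative values):

```ruby
vocabulary      = ['a', 'b', 'c', 'd']
doc_frequencies = [1, 5, 60, 12]      # documents containing each word
options = { min_df: 1, max_df: 50 }   # illustrative bounds

vocabulary.select.with_index do |_token, index|
  freq = doc_frequencies[index]
  options[:min_df] < freq && freq < options[:max_df]
end
# => ["b", "d"] -- 'a' fails min_df (not strictly greater), 'c' fails max_df
```
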
data/lib/lurn/text/word_count_vectorizer.rb
ADDED
@@ -0,0 +1,65 @@
+module Lurn
+  module Text
+    class WordCountVectorizer
+
+      attr_accessor :tokenizer
+      attr_accessor :vocabulary
+
+      def initialize(options = {})
+        @tokenizer = options[:tokenizer] || WordTokenizer.new
+        @vocabulary = []
+
+        options[:max_df] ||= 50
+        options[:min_df] ||= 0
+        @options = options
+      end
+
+      def fit(documents)
+        @vocabulary = []
+        tokenized_docs = tokenize_documents(documents)
+        @vocabulary = tokenized_docs.flatten(1).uniq.sort
+        reduce_features(tokenized_docs)
+      end
+
+      def to_h
+        {
+          tokenizer_options: @tokenizer.to_h,
+          vocabulary: @vocabulary
+        }
+      end
+
+      def transform(documents)
+        documents.map do |document|
+          tokens = @tokenizer.tokenize(document)
+          @vocabulary.map do |word|
+            tokens.count word
+          end
+        end
+      end
+
+      private
+
+      def reduce_features(tokenized_docs)
+        doc_frequencies = Array.new(@vocabulary.length, 0)
+
+        tokenized_docs.each do |tokens|
+          tokens.each do |token|
+            vocab_index = @vocabulary.index(token)
+            doc_frequencies[vocab_index] += 1
+          end
+        end
+
+        reduced_features = @vocabulary.select.with_index do |token, index|
+          freq = doc_frequencies[index]
+          @options[:min_df] < freq && freq < @options[:max_df]
+        end
+
+        @vocabulary = reduced_features
+      end
+
+      def tokenize_documents(documents)
+        documents.map { |doc| @tokenizer.tokenize(doc).uniq }
+      end
+    end
+  end
+end

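A detail worth knowing when reading this class: `tokenize_documents` uniques each document's tokens, so `reduce_features` filters on document frequency, while `transform` re-tokenizes without uniquing and returns raw occurrence counts. A short usage sketch (toy documents; assumes the release's default tokenizer and df bounds):

```ruby
require 'lurn'

vectorizer = Lurn::Text::WordCountVectorizer.new
vectorizer.fit(['hello hello world', 'hello fred'])

vectorizer.vocabulary
# => ["fred", "hello", "world"] (sorted; all pass the default df bounds)

vectorizer.transform(['hello hello fred'])
# => [[1, 2, 0]] -- raw counts per vocabulary word
```
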
data/lib/lurn/text/word_tokenizer.rb
CHANGED
@@ -21,17 +21,22 @@ module Lurn
     @options[:strip_punctuation] ||= false
     @options[:strip_stopwords] ||= false
     @options[:stem_words] ||= false
+    @options[:ngrams] ||= 1
   end
 
   def tokenize(document)
-    document = document.gsub(/[[:punct:]]/,
-    document = document.
+    document = document.gsub(/[[:punct:]]/, "") if @options[:strip_punctuation] == true
+    document = document.split("\s")
 
     if(@options[:stem_words])
       stemmer = Lingua::Stemmer.new(language: :en)
       document = document.map { |word| stemmer.stem(word) }
     end
 
+    if(@options[:ngrams] > 1)
+      document = document.each_cons(@options[:ngrams]).to_a
+    end
+
     document
   end
 

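The new `ngrams` option turns the token list into overlapping n-word groups via `each_cons`, which is why the vectorizers above switched to `flatten(1)`. The behavior in isolation (plain Ruby, no gem needed):

```ruby
tokens = "the quick brown fox".split("\s")
tokens.each_cons(2).to_a
# => [["the", "quick"], ["quick", "brown"], ["brown", "fox"]]
```
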
data/lurn.gemspec
CHANGED
@@ -1,11 +1,7 @@
 # coding: utf-8
-lib = File.expand_path('../lib', __FILE__)
-$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require 'lurn/version'
-
 Gem::Specification.new do |spec|
   spec.name          = "lurn"
-  spec.version       =
+  spec.version       = "0.1.2"
   spec.authors       = ["daniel.carpenter"]
   spec.email         = ["daniel.carpenter01@gmail.com"]
 
@@ -21,17 +17,15 @@ Gem::Specification.new do |spec|
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
 
-  spec.add_dependency "terminal-table", "~> 1.
+  spec.add_dependency "terminal-table", "~> 1.8.0", '>= 1.8.0'
   spec.add_dependency "ruby-stemmer", "~> 0.9.6"
-  spec.add_dependency "daru",
+  spec.add_dependency "daru", "~> 0.2.1"
 
   spec.add_development_dependency "bundler", "~> 1.13"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec", "~> 3.0"
-  spec.add_development_dependency "awesome_print"
-  spec.add_development_dependency "byebug"
+  spec.add_development_dependency "awesome_print", "~> 0"
+  spec.add_development_dependency "byebug", "~> 10.0", ">= 10.0.2"
+  spec.add_development_dependency "rspec_junit_formatter", "~> 0.4", ">= 0.4.1"
+  spec.add_development_dependency "yard", "~> 0.9.9"
 end

data/readmes/evaluation/classifier_evaluator.md
ADDED
@@ -0,0 +1,21 @@
+# Classifier Evaluator
+`Lurn::Evaluation::ClassifierEvaluator` provides some basic functionality for evaluating the performance of a classifier.
+
+## Example
+```
+actual_class = ['sports','science','science','sports']
+predicted_class = ['sports','sports','science','sports']
+
+eval = Lurn::Evaluation::ClassifierEvaluator.new predicted_class, actual_class
+
+print eval.summary
+
+# output
++-----------------+--------------------+--------+
+| Class           | Precision          | Recall |
++-----------------+--------------------+--------+
+| sports          | 0.6666666666666666 | 1.0    |
+| science         | 1.0                | 0.5    |
+| Overall Average | 0.8333333333333333 | 0.75   |
++-----------------+--------------------+--------+
+```

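The table's numbers check out: precision is true positives over predicted positives per class, recall is true positives over actual positives. A standalone verification in plain Ruby (no gem required):

```ruby
actual    = ['sports','science','science','sports']
predicted = ['sports','sports','science','sports']

['sports', 'science'].each do |cls|
  tp = actual.zip(predicted).count { |a, p| a == cls && p == cls }
  precision = tp.to_f / predicted.count(cls) # 2/3, then 1/1
  recall    = tp.to_f / actual.count(cls)    # 2/2, then 1/2
  puts format('%s: precision=%.4f recall=%.4f', cls, precision, recall)
end
```
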
data/readmes/naive_bayes/bernoulli_naive_bayes.md
ADDED
@@ -0,0 +1,41 @@
+### Bernoulli Naive Bayes
+Naive bayes is a bayesian model often used for text classification. Bernoulli Naive Bayes specifically classifies observations based on the presence or absence of a feature in an observation.
+
+Below is a simple text classification using Naive Bayes in Lurn.
+
+1. Start with some text documents
+
+```ruby
+documents = [
+  'ruby is a great programming language',
+  'the giants recently won the world series',
+  'java is a compiled programming language',
+  'the jets are a football team'
+]
+
+labels = ['computers','sports','computers','sports']
+```
+
+2. Convert them to arrays of booleans representing which words they contain (or don't contain). Lurn provides vectorizers for this purpose.
+```
+vectorizer = Lurn::Text::BernoulliVectorizer.new
+vectorizer.fit(documents)
+vectors = vectorizer.transform(documents)
+```
+
+3. Initialize and train the model
+```
+model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
+model.fit(vectors, labels)
+```
+
+4. Classify a new document
+```
+new_vectors = vectorizer.transform(['programming is fun'])
+
+# get the most probable class for the new document given the training data
+model.max_class(new_vectors.first)
+
+# get the probability score for the most probable class
+model.max_probability(new_vectors.first)
+```

data/readmes/naive_bayes/multinomial_naive_bayes.md
ADDED
@@ -0,0 +1,41 @@
+### Multinomial Naive Bayes
+Naive bayes is a bayesian model often used for text classification. Multinomial Naive Bayes specifically classifies observations based on variables with a multinomial distribution (a.k.a. numbers).
+
+Below is a simple text classification using Multinomial Naive Bayes in Lurn.
+
+1. Start with some text documents
+
+```ruby
+documents = [
+  'ruby is a great programming language',
+  'the giants recently won the world series',
+  'java is a compiled programming language',
+  'the jets are a football team'
+]
+
+labels = ['computers','sports','computers','sports']
+```
+
+2. Convert them to arrays of booleans representing which words they contain (or don't contain). Lurn provides vectorizers for this purpose.
+```
+vectorizer = Lurn::Text::WordCountVectorizer.new
+vectorizer.fit(documents)
+vectors = vectorizer.transform(documents)
+```
+
+3. Initialize and train the model
+```
+model = Lurn::NaiveBayes::MultinomialNaiveBayes.new
+model.fit(vectors, labels)
+```
+
+4. Classify a new document
+```
+new_vectors = vectorizer.transform(['programming is fun'])
+
+# get the most probable class for the new document given the training data
+model.max_class(new_vectors.first)
+
+# get the probability score for the most probable class
+model.max_probability(new_vectors.first)
+```

data/readmes/neighbors/knn_classification.md
ADDED
@@ -0,0 +1,48 @@
+### K Nearest Neighbor Classifier
+K Nearest Neighbor (KNN) Classification is one of the simplest forms of classification
+in the machine learning toolbox. Training data is stored on the model and all
+computation is deferred until the time of prediction. When a new observation
+is provided it calculates the distance between the new observation and all
+training data in an n-dimensional space (where n is the number of variables).
+The predicted class is the most common class among the k closest training records.
+
+Below is a simple example of using KNN Classification in Lurn.
+
+Suppose we have a dataset containing the income, years of college eduction and job title
+for a set of people. We could use this as training data to predict
+people's job title based on their income and years of eduction.
+
+```ruby
+people = [
+  # years of education   annual income   job title
+  [ 4,                   50000,          'engineer'],
+  [ 6,                   60000,          'scientist'],
+  [ 2,                   40000,          'engineer'],
+  [ 8,                   90000,          'scientist'],
+  [ 4,                   70000,          'librarian'],
+]
+
+# eduction and income
+predictors = people.map { |person| person[0..1] }
+
+# extract annual income
+target_var = people.map { |person| person[2]}
+```
+
+The model can be trained by passing the predictors and target values to an initialized
+instance of the KNNClassifier model.
+
+```ruby
+# initialize the model with a k of 2
+model = Lurn::Neighbors::KNNClassifier.new(2)
+
+model.fit(predictors, target_var)
+```
+
+The model can now be used to predict the income of a person given his/her
+age and years of education.
+
+```ruby
+# predict the job title of person with 4 years of eduction who make $45,000
+model.predict([4, 45000]) # => engineer
+```

data/readmes/neighbors/knn_regression.md
ADDED
@@ -0,0 +1,48 @@
+### K Nearest Neighbor Regression
+K Nearest Neighbor (KNN) Regression is one of the simplest forms of regression
+in the machine learning toolbox. Training data is stored on the model and all
+computation is deferred until the time of prediction. When a new observation
+is provided it calculates the distance between the new observation and all
+training data in an n-dimensional space (where n is the number of variables).
+The predicted value is the average value of the k closest training records.
+
+Below is a simple example of using KNN Regression in Lurn.
+
+Suppose we have a dataset containing the age, years of college eduction and annual
+income for a set of people. We could use this as training data to predict
+people's annual income based on their age and years of eduction.
+
+```ruby
+people = [
+  # age   years of education   annual income
+  [ 25,   4,                   50000],
+  [ 35,   6,                   60000],
+  [ 51,   2,                   40000],
+  [ 45,   8,                   90000],
+  [ 32,   4,                   70000],
+]
+
+# extract age and eduction
+predictors = people.map { |person| person[0..1] }
+
+# extract annual income
+target_var = people.map { |person| person[2]}
+```
+
+The model can be trained by passing the predictors and target values to an initialized
+instance of the KNNRegression model.
+
+```ruby
+# initialize the model with a k of 2
+model = Lurn::Neighbors::KNNRegression.new(2)
+
+model.fit(predictors, target_var)
+```
+
+The model can now be used to predict the income of a person given his/her
+age and years of education.
+
+```ruby
+# predict the income of a 31 year old person with 4 years of eduction
+model.predict([31, 4])
+```

data/readmes/text_processing/bernoulli_vectorizer.md
ADDED
@@ -0,0 +1,30 @@
+# Bernoulli Vectorizer (word presence vectorizer)
+
+A bernoulli document model is one that represents a piece of text as an array of boolean values. Each boolean represents the presence (true) or absence (false) of a word in the document.
+
+`Lurn::Text::BernoulliVectorizer` is intended to make it easy to convert text into Bernoulli vectors.
+
+## Basic example
+```
+docs = ['hello world', 'hello fred']
+
+vectorizer = Lurn::Text::BernoulliVectorizer.new
+
+# vectorizers must be trained in order to know
+# what features (words) exist in the data set
+vectorizer.fit(docs)
+
+vectorizer.transform(docs)
+```
+
+## Configuration
+The BernoulliVectorizer.new includes a number of options for configuring how documents are vectorized. A few include:
+- max_df[int]: Excludes words which appear in more than `max_df` documents
+- min_df[int]: Excludes words which appear in fewer than `min_df` documents
+- strip_stopwords[boolean]: Removes stop words if true
+- stem_words[boolean]: Stems words in the documents if true
+- ngrams[int]: Features will be determined based on groupings of `ngrams` consecutive words instead of individual words
+
+```
+Lurn::BernoulliVectorizer.new(strip_stopwords: true, min_df: 10)
+```

metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: lurn
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - daniel.carpenter
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2018-08-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: terminal-table
@@ -16,14 +16,20 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.8.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.8.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.0
 - !ruby/object:Gem::Dependency
   name: ruby-stemmer
   requirement: !ruby/object:Gem::Requirement
@@ -44,14 +50,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1
+        version: 0.2.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1
+        version: 0.2.1
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -98,30 +104,70 @@ dependencies:
   name: awesome_print
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: byebug
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
     - - ">="
       - !ruby/object:Gem::Version
-      version:
+        version: 10.0.2
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
     - - ">="
       - !ruby/object:Gem::Version
-      version:
+        version: 10.0.2
+- !ruby/object:Gem::Dependency
+  name: rspec_junit_formatter
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.1
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.9
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.9
 description: " A gem with tools for machine learning. "
 email:
 - daniel.carpenter01@gmail.com
@@ -129,6 +175,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".rspec"
 - ".travis.yml"
@@ -140,11 +187,22 @@ files:
 - bin/setup
 - lib/lurn.rb
 - lib/lurn/evaluation/classifier_evaluator.rb
+- lib/lurn/naive_bayes/base.rb
 - lib/lurn/naive_bayes/bernoulli_naive_bayes.rb
+- lib/lurn/naive_bayes/multinomial_naive_bayes.rb
+- lib/lurn/neighbors/knn_base.rb
+- lib/lurn/neighbors/knn_classifier.rb
+- lib/lurn/neighbors/knn_regression.rb
 - lib/lurn/text/bernoulli_vectorizer.rb
+- lib/lurn/text/word_count_vectorizer.rb
 - lib/lurn/text/word_tokenizer.rb
-- lib/lurn/version.rb
 - lurn.gemspec
+- readmes/evaluation/classifier_evaluator.md
+- readmes/naive_bayes/bernoulli_naive_bayes.md
+- readmes/naive_bayes/multinomial_naive_bayes.md
+- readmes/neighbors/knn_classification.md
+- readmes/neighbors/knn_regression.md
+- readmes/text_processing/bernoulli_vectorizer.md
 homepage: https://www.github.com/dansbits/lurn
 licenses:
 - MIT

data/lib/lurn/version.rb
DELETED