lurn 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: b628aa2df6567044144aebc12d52f284b0eb93e9
- data.tar.gz: 7d6089b8ca48eb371e39288ae034543b6f447e9d
+ metadata.gz: 74be6a1bd3e76e61d34048367f8fafed76c39a46
+ data.tar.gz: 355ea667da4dd95845d00d8ebebc40636d740cab
  SHA512:
- metadata.gz: 0140373cd80d2594c4c34e5c9959f043b1f33ff527b5dd5e8ebcba7173153ee2ed3766b2f49888780040dbbe1b04e6bfce656bf0cd0b21294a9c0e50898bd798
- data.tar.gz: dd31fabb232408c405fe7e40a630723fa39ab30f28982b8db6f0f6cf3dfeca4fc4bf4dbaae5df889e48c101617208993ee01701b1e2f34e0d550c499299d1789
+ metadata.gz: 9dad1d2540818efd226bb029aca818b19f5995a3bfbe77392e2577236038f7c84237cb4b19203e0101da1830338d135ea321e27cbecd32664851994c5a100035
+ data.tar.gz: 4defd5fc70dcfbd3389ab184cf59764cf734bc8bda9347070e0c448cb7286ca7d1cd17ca926785191501ca030a4f8f3c74639fa4271a3f6d13cabe88d2a012d0
.circleci/config.yml ADDED
@@ -0,0 +1,46 @@
+ # Ruby CircleCI 2.0 configuration file
+ #
+ # Check https://circleci.com/docs/2.0/language-ruby/ for more details
+ #
+ version: 2
+ jobs:
+   build:
+     docker:
+       # specify the version you desire here
+       - image: circleci/ruby:2.4
+
+       # Specify service dependencies here if necessary
+       # CircleCI maintains a library of pre-built images
+       # documented at https://circleci.com/docs/2.0/circleci-images/
+       # - image: circleci/postgres:9.4
+
+     working_directory: ~/repo
+
+     steps:
+       - checkout
+
+       - run:
+           name: install dependencies
+           command: |
+             bundle install --jobs=4 --retry=3 --path vendor/bundle
+
+       # run tests!
+       - run:
+           name: run tests
+           command: |
+             mkdir /tmp/test-results
+             TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
+
+             bundle exec rspec --format progress \
+                               --format RspecJunitFormatter \
+                               --out /tmp/test-results/rspec.xml \
+                               --format progress \
+                               -- \
+                               $TEST_FILES
+
+       # collect reports
+       - store_test_results:
+           path: /tmp/test-results
+       - store_artifacts:
+           path: /tmp/test-results
+           destination: test-results
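
In the `run tests` step above, `circleci tests glob` expands the spec file pattern and `circleci tests split --split-by=timings` partitions the resulting list across parallel containers using historical timing data; on a single container it effectively passes every matched file straight through to `rspec`.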
data/README.md CHANGED
@@ -1,6 +1,6 @@
  # Lurn

- Lurn is a ruby gem for performing machine learning. The API and design patterns in Lurn are inspired by sklearn, an analogous library for Python.
+ Lurn is a ruby gem for performing machine learning tasks. The API and design patterns in Lurn are inspired by scikit-learn, a popular machine learning library for Python.

  ## Installation

@@ -20,37 +20,16 @@ Or install it yourself as:

  ## Usage

- ### Bernoulli Naive Bayes
- ```ruby
- require 'lurn'
-
- documents = [
-   'ruby is a great programming language',
-   'the giants recently won the world series',
-   'java is a compiled programming language',
-   'the jets are a football team'
- ]
-
- labels = ['computers','sports','computers','sports']
-
- # vectorizers take raw data and transform it to a set of features that our
- # model can understand - in this case an array of boolean values representing
- # the presence or absence of a word in text
- vectorizer = Lurn::Text::BernoulliVectorizer.new
- vectorizer.fit(documents)
- vectors = vectorizer.transform(documents)
-
- model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
- model.fit(vectors, labels)
-
- new_vectors = vectorizer.transform(['programming is fun'])
- probabilities = model.predict_probabilities(new_vectors.first)
- # => [0.9715681919147049, 0.028431808085295614]
-
- # to get the class of the maximum probability, look at the same index of the
- # unique_labels attribute on the model
- model.unique_labels[0] # => 'computers'
- ```
+ - Naive Bayes
+   - [Bernoulli Naive Bayes](readmes/naive_bayes/bernoulli_naive_bayes.md)
+   - [Multinomial Naive Bayes](readmes/naive_bayes/multinomial_naive_bayes.md)
+ - Nearest Neighbor Models
+   - [K Nearest Neighbor Regression](readmes/neighbors/knn_regression.md)
+   - [K Nearest Neighbor Classification](readmes/neighbors/knn_classification.md)
+ - Text Processing
+   - [Bernoulli Vectorizer](readmes/text_processing/bernoulli_vectorizer.md)
+ - Model Evaluation
+   - [ClassifierEvaluator](readmes/evaluation/classifier_evaluator.md)

  ## Development

lib/lurn.rb CHANGED
@@ -1,10 +1,14 @@
  require "daru"
- require "lurn/version"
  require "lurn/text/word_tokenizer"
  require "lurn/text/bernoulli_vectorizer"
+ require "lurn/text/word_count_vectorizer"
+ require "lurn/naive_bayes/base"
  require "lurn/naive_bayes/bernoulli_naive_bayes"
+ require "lurn/naive_bayes/multinomial_naive_bayes"
  require "lurn/evaluation/classifier_evaluator"
+ require "lurn/neighbors/knn_base"
+ require "lurn/neighbors/knn_regression"
+ require "lurn/neighbors/knn_classifier"

  module Lurn
- # Your code goes here...
  end
lib/lurn/naive_bayes/base.rb ADDED
@@ -0,0 +1,32 @@
+ module Lurn
+   module NaiveBayes
+     class Base
+       def predict_probabilities(vector)
+         log_probabilties = predict_log_probabilities(vector)
+
+         log_probabilties.map { |p| Math.exp(p) }
+       end
+
+       def max_class(vector)
+         log_probs = predict_log_probabilities(vector)
+
+         max_index = log_probs.index(log_probs.max)
+
+         unique_labels[max_index]
+       end
+
+       def max_probability(vector)
+         probs = predict_probabilities(vector)
+
+         probs.max
+       end
+
+       def predict_log_probabilities(vector)
+         vector = Vector.elements(vector)
+         jll = joint_log_likelihood(vector)
+         log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))
+         jll.map{ |v| v - log_prob_x }
+       end
+     end
+   end
+ end
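
`predict_log_probabilities` above turns unnormalized joint log-likelihoods into log posteriors by subtracting log P(x), computed by exponentiating back to probability space and summing. A minimal standalone sketch of the same arithmetic, with hypothetical likelihood values:

```ruby
# Hypothetical joint log-likelihoods, log P(x, y), for two classes.
jll = [-2.0, -4.0]

# log P(x) = log of the summed joint probabilities.
log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))

# Log posteriors: log P(y | x) = log P(x, y) - log P(x).
log_posteriors = jll.map { |v| v - log_prob_x }

log_posteriors.map { |p| Math.exp(p) } # => roughly [0.88, 0.12], summing to 1
```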
lib/lurn/naive_bayes/bernoulli_naive_bayes.rb CHANGED
@@ -2,7 +2,7 @@ require 'matrix'

  module Lurn
    module NaiveBayes
-     class BernoulliNaiveBayes
+     class BernoulliNaiveBayes < Base

        attr_accessor :probability_matrix, :label_probabilities, :unique_labels

@@ -19,38 +19,7 @@ module Lurn
        document_count_matrix = build_document_count_matrix(vectors, labels)
        @probability_matrix = build_probability_matrix(document_count_matrix, labels)

-       @label_probabilities = @unique_labels.map { |l1| labels.select { |l2| l1 == l2 }.count.to_f / labels.count.to_f }
-     end
-
-     def predict_probabilities(vector)
-       log_probabilties = predict_log_probabilities(vector)
-
-       log_probabilties.map { |p| Math.exp(p) }
-     end
-
-     def predict_log_probabilities(vector)
-
-       probabilities = @unique_labels.map do |label|
-         joint_log_likelihood(vector, label)
-       end
-
-       log_prob_x = Math.log(probabilities.map { |v| Math.exp(v) }.sum)
-
-       probabilities.map { |p| p - log_prob_x }
-     end
-
-     def max_class(vector)
-       log_probs = predict_log_probabilities(vector)
-
-       max_index = log_probs.index(log_probs.max)
-
-       unique_labels[max_index]
-     end
-
-     def max_probability(vector)
-       probs = predict_probabilities(vector)
-
-       probs.max
+       @label_probabilities = @unique_labels.map { |l1| labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f }
      end

      def to_h
@@ -64,11 +33,11 @@ module Lurn
      private

      def build_probability_matrix(document_count_matrix, labels)
-       probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count) { 0.0 } }
+       probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }

        document_count_matrix.each_with_index do |value, row, col|
          label = @unique_labels[row]
-         label_frequency = labels.select { |l| l == label }.count
+         label_frequency = labels.count(label)

          probability_matrix[row][col] = Math.log((value.to_f + @k) / (label_frequency.to_f + (2.0 * @k)))
        end
@@ -77,7 +46,7 @@ module Lurn
      end

      def build_document_count_matrix(vectors, labels)
-       matrix = Array.new(@unique_labels.count) { Array.new(@feature_count) { 0 } }
+       matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }

        vectors.each_with_index do |value, row, col|
          if value == true
@@ -90,16 +59,20 @@ module Lurn
        Matrix.rows(matrix)
      end

-     def joint_log_likelihood(vector, label)
-       label_index = @unique_labels.index(label)
+     def joint_log_likelihood(x)
+       jlls = []

-       vector = Vector.elements(vector.map { |e| e == true ? 1 : 0 })
-       probabilities = @probability_matrix.row(label_index)
-       neg_probs = probabilities.map { |prb| Math.log(1.0 - Math.exp(prb)) }
-       jll = vector.dot(probabilities - neg_probs)
-       jll += Math.log(@label_probabilities[label_index]) + neg_probs.sum
+       unique_labels.each_with_index do |label, label_index|
+         vector = Vector.elements(x.map { |e| e == true ? 1 : 0 })
+         probabilities = @probability_matrix.row(label_index)
+         neg_probs = probabilities.map { |prb| Math.log(1.0 - Math.exp(prb)) }
+         jll = vector.dot(probabilities - neg_probs)
+         jll += Math.log(@label_probabilities[label_index]) + neg_probs.inject(:+)
+
+         jlls.push jll
+       end

-       jll
+       jlls
      end

    end
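
The dot-product form in the rewritten `joint_log_likelihood` is an algebraic rearrangement of the standard Bernoulli log-likelihood. Writing p_i for the fitted probability of feature i under a class and x_i in {0, 1} for the input vector:

```latex
\sum_i \left[ x_i \log p_i + (1 - x_i)\log(1 - p_i) \right]
  = \sum_i x_i \left( \log p_i - \log(1 - p_i) \right) + \sum_i \log(1 - p_i)
```

The first term is `vector.dot(probabilities - neg_probs)`, the second is `neg_probs.inject(:+)`, and adding `Math.log(@label_probabilities[label_index])` contributes the class prior to form the joint likelihood.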
lib/lurn/naive_bayes/multinomial_naive_bayes.rb ADDED
@@ -0,0 +1,65 @@
+ module Lurn
+   module NaiveBayes
+     class MultinomialNaiveBayes < Base
+
+       attr_accessor :prior_probabilities, :probability_matrix, :unique_labels
+
+       def initialize
+
+       end
+
+       def fit(vectors, labels)
+         vectors = Matrix.rows(vectors)
+
+         @unique_labels = labels.uniq
+         @feature_count = vectors.column_size
+         count_matrix = build_count_matrix(vectors, labels)
+         @probability_matrix = build_probability_matrix(count_matrix, labels)
+         @prior_probabilities = @unique_labels.map do |l1|
+           labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f
+         end
+       end
+
+       private
+
+       def build_probability_matrix(count_matrix, labels)
+         probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
+
+         count_matrix.each_with_index do |value, row, col|
+           label = @unique_labels[row]
+           label_frequency = labels.count(label)
+
+           numerator = (value.to_f + 1.0)
+           denominator = count_matrix.row(row).inject(:+) + @feature_count
+           probability_matrix[row][col] = Math.log(numerator / denominator)
+         end
+
+         probability_matrix
+       end
+
+       def build_count_matrix(vectors, labels)
+         matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
+
+         vectors.each_with_index do |value, row, col|
+           label = labels[row]
+           label_index = @unique_labels.index(label)
+           matrix[label_index][col] += value
+         end
+
+         Matrix.rows(matrix)
+       end
+
+       def joint_log_likelihood(vector)
+         jlls = []
+         @unique_labels.each_with_index do |label, label_index|
+           probabilities = @probability_matrix[label_index]
+           jll = vector.dot(probabilities)
+           jll += Math.log(@prior_probabilities[label_index])
+           jlls.push(jll)
+         end
+
+         jlls
+       end
+     end
+   end
+ end
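
`build_probability_matrix` above applies add-one (Laplace) smoothing: each word count is incremented by one and the denominator grows by the vocabulary size, so words unseen in a class never get zero probability. The arithmetic, with hypothetical counts:

```ruby
# Hypothetical counts for one class: a word seen 3 times out of
# 10 total word occurrences, with a 5-word vocabulary.
word_count  = 3
total_count = 10
vocab_size  = 5

smoothed = (word_count + 1.0) / (total_count + vocab_size)
# => 4.0 / 15.0, roughly 0.267; an unseen word gets 1.0 / 15.0 instead of 0
```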
lib/lurn/neighbors/knn_base.rb ADDED
@@ -0,0 +1,54 @@
+ module Lurn
+   module Neighbors
+     class KNNBase
+
+       attr_accessor :predictors, :targets, :k
+
+       def initialize(k)
+         @k = k
+       end
+
+       # Trains the KNN regression model to predict the target variable
+       # based on the predictors. For KNN Regression all computation is
+       # deferred until the time of prediction so in this case the data
+       # is just stored.
+       #
+       # @param predictors [Array-like] An array of arrays containing the predictor data
+       # @param targets [Array-like] An array with the value you want to predict
+       def fit(predictors, targets)
+         @predictors = predictors.map { |pred| Vector.elements(pred) }
+         @targets = targets
+
+         nil
+       end
+
+       # Returns the predictors and target value for the k nearest neighbors for the vector parameter
+       #
+       # @param vector [Array-like] An array of the same length and type as the predictors used to train the model
+       # @return [Array, Array]
+       #   Returns two values. The first is an array of the predictors for the k nearest neighbors. The second is an
+       #   array of the corresponding target values for the k nearest neighbors.
+       def nearest_neighbors(vector)
+         vector = Vector.elements(vector)
+
+         distances = @predictors.map.with_index do |p, index|
+           { index: index, distance: euclidian_distance(p, vector), value: targets[index] }
+         end
+
+         distances.sort! { |x,y| x[:distance] <=> y[:distance] }
+
+         neighboring_predictors = distances.first(@k).map { |neighbor| @predictors[neighbor[:index]] }
+         neighboring_targets = distances.first(@k).map { |neighbor| @targets[neighbor[:index]] }
+
+         return neighboring_predictors, neighboring_targets
+       end
+
+       private
+
+       def euclidian_distance(vector1, vector2)
+         Math.sqrt((vector1 - vector2).map { |v| (v.abs)**2 }.inject(:+))
+       end
+
+     end
+   end
+ end
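
`euclidian_distance` above is the ordinary Euclidean metric expressed with Ruby's `Vector` arithmetic; a quick standalone check on a hypothetical 3-4-5 triangle:

```ruby
require 'matrix'

v1 = Vector.elements([0, 0])
v2 = Vector.elements([3, 4])

# Same computation as KNNBase#euclidian_distance.
Math.sqrt((v1 - v2).map { |v| v.abs**2 }.inject(:+)) # => 5.0
```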
lib/lurn/neighbors/knn_classifier.rb ADDED
@@ -0,0 +1,23 @@
+ module Lurn
+   module Neighbors
+     class KNNClassifier < KNNBase
+
+       # Predicts the class of the given observation by selecting the most common class of the
+       # closest k training observations based on euclidian distance. In the case of a tie one winner
+       # will be chosen at random from the most frequent classes.
+       #
+       # @param vector [Array-like]
+       #   An array (or array-like) of the same length as the predictors used
+       #   to fit the model
+       # @return [Object] The predicted class
+       def predict(vector)
+         _, neighboring_targets = nearest_neighbors(vector)
+
+         class_frequencies = neighboring_targets.inject(Hash.new(0)) { |h,v| h[v] += 1; h }
+
+         neighboring_targets.max_by { |v| class_frequencies[v] }
+       end
+
+     end
+   end
+ end
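
The frequency hash in `predict` is a compact majority vote. In isolation, with hypothetical neighbor labels:

```ruby
neighboring_targets = ['sports', 'computers', 'sports']

# Tally labels; Hash.new(0) defaults unseen keys to zero.
class_frequencies = neighboring_targets.inject(Hash.new(0)) { |h, v| h[v] += 1; h }
# => {"sports"=>2, "computers"=>1}

neighboring_targets.max_by { |v| class_frequencies[v] } # => "sports"
```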
lib/lurn/neighbors/knn_regression.rb ADDED
@@ -0,0 +1,20 @@
+ module Lurn
+   module Neighbors
+     class KNNRegression < KNNBase
+
+       # Predicts the value of the given observation by averaging the target value of the
+       # closest k predictor observations based on euclidian distance.
+       #
+       # @param vector [Array-like]
+       #   An array (or array-like) of the same length as the predictors used
+       #   to fit the model
+       # @return [Float] The predicted value
+       def predict(vector)
+         _, neighboring_targets = nearest_neighbors(vector)
+
+         neighboring_targets.inject(:+).to_f / neighboring_targets.length.to_f
+       end
+
+     end
+   end
+ end
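
End to end, prediction is just the mean target of the k nearest stored points. A tiny sketch with hypothetical 1-D data (assuming the gem is installed and loaded):

```ruby
require 'lurn'

model = Lurn::Neighbors::KNNRegression.new(2)
model.fit([[1], [2], [3]], [10, 20, 30])

# The two nearest neighbors of 1.4 are x=1 and x=2,
# so the prediction is (10 + 20) / 2.0.
model.predict([1.4]) # => 15.0
```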
lib/lurn/text/bernoulli_vectorizer.rb CHANGED
@@ -17,7 +17,7 @@ module Lurn
      def fit(documents)
        @vocabulary = []
        tokenized_docs = tokenize_documents(documents)
-       @vocabulary = tokenized_docs.flatten.uniq.sort
+       @vocabulary = tokenized_docs.flatten(1).uniq.sort
        reduce_features(tokenized_docs)
      end

@@ -49,12 +49,9 @@ module Lurn
          end
        end

-       reduced_features = []
-       @vocabulary.each_with_index do |token, index|
+       reduced_features = @vocabulary.select.with_index do |token, index|
          freq = doc_frequencies[index]
-         if freq < @options[:max_df] && freq > @options[:min_df]
-           reduced_features.push token
-         end
+         @options[:min_df] < freq && freq < @options[:max_df]
        end

        @vocabulary = reduced_features
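
The move from `flatten` to `flatten(1)` matters once the tokenizer emits n-grams, which are themselves arrays: a full flatten would dissolve them into single words, while `flatten(1)` keeps each n-gram intact as one vocabulary entry. With hypothetical bigram tokens:

```ruby
# Two documents tokenized as bigrams (each token is a 2-element array).
tokenized_docs = [
  [['hello', 'world']],
  [['hello', 'world'], ['world', 'peace']]
]

tokenized_docs.flatten(1).uniq.sort
# => [["hello", "world"], ["world", "peace"]]  -- bigrams kept as features

tokenized_docs.flatten.uniq.sort
# => ["hello", "peace", "world"]               -- bigrams destroyed
```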
lib/lurn/text/word_count_vectorizer.rb ADDED
@@ -0,0 +1,65 @@
+ module Lurn
+   module Text
+     class WordCountVectorizer
+
+       attr_accessor :tokenizer
+       attr_accessor :vocabulary
+
+       def initialize(options = {})
+         @tokenizer = options[:tokenizer] || WordTokenizer.new
+         @vocabulary = []
+
+         options[:max_df] ||= 50
+         options[:min_df] ||= 0
+         @options = options
+       end
+
+       def fit(documents)
+         @vocabulary = []
+         tokenized_docs = tokenize_documents(documents)
+         @vocabulary = tokenized_docs.flatten(1).uniq.sort
+         reduce_features(tokenized_docs)
+       end
+
+       def to_h
+         {
+           tokenizer_options: @tokenizer.to_h,
+           vocabulary: @vocabulary
+         }
+       end
+
+       def transform(documents)
+         documents.map do |document|
+           tokens = @tokenizer.tokenize(document)
+           @vocabulary.map do |word|
+             tokens.count word
+           end
+         end
+       end
+
+       private
+
+       def reduce_features(tokenized_docs)
+         doc_frequencies = Array.new(@vocabulary.length, 0)
+
+         tokenized_docs.each do |tokens|
+           tokens.each do |token|
+             vocab_index = @vocabulary.index(token)
+             doc_frequencies[vocab_index] += 1
+           end
+         end
+
+         reduced_features = @vocabulary.select.with_index do |token, index|
+           freq = doc_frequencies[index]
+           @options[:min_df] < freq && freq < @options[:max_df]
+         end
+
+         @vocabulary = reduced_features
+       end
+
+       def tokenize_documents(documents)
+         documents.map { |doc| @tokenizer.tokenize(doc).uniq }
+       end
+     end
+   end
+ end
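
`transform` maps each document onto per-word counts over the fitted, sorted vocabulary. A small sketch (assuming the gem is loaded and the default tokenizer options shown above):

```ruby
require 'lurn'

docs = ['to be or not to be', 'be happy']

vectorizer = Lurn::Text::WordCountVectorizer.new
vectorizer.fit(docs)
vectorizer.vocabulary
# => ["be", "happy", "not", "or", "to"]

vectorizer.transform(docs)
# => [[2, 0, 1, 1, 2],
#     [1, 1, 0, 0, 0]]
```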
lib/lurn/text/word_tokenizer.rb CHANGED
@@ -21,17 +21,22 @@ module Lurn
      @options[:strip_punctuation] ||= false
      @options[:strip_stopwords] ||= false
      @options[:stem_words] ||= false
+     @options[:ngrams] ||= 1
    end

    def tokenize(document)
-     document = document.gsub(/[[:punct:]]/, '') if @options[:strip_punctuation] == true
-     document = document.gsub(/\s+/, ' ').split(" ")
+     document = document.gsub(/[[:punct:]]/, "") if @options[:strip_punctuation] == true
+     document = document.split("\s")

      if(@options[:stem_words])
        stemmer = Lingua::Stemmer.new(language: :en)
        document = document.map { |word| stemmer.stem(word) }
      end

+     if(@options[:ngrams] > 1)
+       document = document.each_cons(@options[:ngrams]).to_a
+     end
+
      document
    end
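
The new `ngrams` option turns the token stream into overlapping word groups via `each_cons`; for instance:

```ruby
words = ['the', 'quick', 'brown', 'fox']

# Overlapping bigrams, as produced when the tokenizer is built with ngrams: 2.
words.each_cons(2).to_a
# => [["the", "quick"], ["quick", "brown"], ["brown", "fox"]]
```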
 
lurn.gemspec CHANGED
@@ -1,11 +1,7 @@
  # coding: utf-8
- lib = File.expand_path('../lib', __FILE__)
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
- require 'lurn/version'
-
  Gem::Specification.new do |spec|
    spec.name = "lurn"
-   spec.version = Lurn::VERSION
+   spec.version = "0.1.2"
    spec.authors = ["daniel.carpenter"]
    spec.email = ["daniel.carpenter01@gmail.com"]

@@ -21,13 +17,15 @@ Gem::Specification.new do |spec|
    spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
    spec.require_paths = ["lib"]

-   spec.add_dependency "terminal-table", "~> 1.7.3"
+   spec.add_dependency "terminal-table", "~> 1.8.0", '>= 1.8.0'
    spec.add_dependency "ruby-stemmer", "~> 0.9.6"
-   spec.add_dependency "daru", '~> 0.1.6'
+   spec.add_dependency "daru", "~> 0.2.1"

    spec.add_development_dependency "bundler", "~> 1.13"
    spec.add_development_dependency "rake", "~> 10.0"
    spec.add_development_dependency "rspec", "~> 3.0"
-   spec.add_development_dependency "awesome_print"
-   spec.add_development_dependency "byebug"
+   spec.add_development_dependency "awesome_print", "~> 0"
+   spec.add_development_dependency "byebug", "~> 10.0", ">= 10.0.2"
+   spec.add_development_dependency "rspec_junit_formatter", "~> 0.4", ">= 0.4.1"
+   spec.add_development_dependency "yard", "~> 0.9.9"
  end
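
One note on the constraints above: RubyGems' pessimistic operator already implies a lower bound, so `"~> 1.8.0"` means `>= 1.8.0, < 1.9`; the extra `">= 1.8.0"` terms are redundant but harmless.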
readmes/evaluation/classifier_evaluator.md ADDED
@@ -0,0 +1,21 @@
+ # Classifier Evaluator
+ `Lurn::Evaluation::ClassifierEvaluator` provides some basic functionality for evaluating the performance of a classifier.
+
+ ## Example
+ ```
+ actual_class = ['sports','science','science','sports']
+ predicted_class = ['sports','sports','science','sports']
+
+ eval = Lurn::Evaluation::ClassifierEvaluator.new predicted_class, actual_class
+
+ print eval.summary
+
+ # output
+ +-----------------+--------------------+--------+
+ | Class           | Precision          | Recall |
+ +-----------------+--------------------+--------+
+ | sports          | 0.6666666666666666 | 1.0    |
+ | science         | 1.0                | 0.5    |
+ | Overall Average | 0.8333333333333333 | 0.75   |
+ +-----------------+--------------------+--------+
+ ```
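
As a sanity check on the `sports` row above, precision and recall can be recomputed by hand (a sketch, not part of the gem):

```ruby
actual    = ['sports', 'science', 'science', 'sports']
predicted = ['sports', 'sports', 'science', 'sports']

tp = predicted.each_index.count { |i| predicted[i] == 'sports' && actual[i] == 'sports' }
fp = predicted.each_index.count { |i| predicted[i] == 'sports' && actual[i] != 'sports' }
fn = predicted.each_index.count { |i| predicted[i] != 'sports' && actual[i] == 'sports' }

tp.to_f / (tp + fp) # precision => 2/3, matching the table
tp.to_f / (tp + fn) # recall    => 1.0
```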
readmes/naive_bayes/bernoulli_naive_bayes.md ADDED
@@ -0,0 +1,41 @@
+ ### Bernoulli Naive Bayes
+ Naive bayes is a bayesian model often used for text classification. Bernoulli Naive Bayes specifically classifies observations based on the presence or absence of a feature in an observation.
+
+ Below is a simple text classification using Naive Bayes in Lurn.
+
+ 1. Start with some text documents
+
+ ```ruby
+ documents = [
+   'ruby is a great programming language',
+   'the giants recently won the world series',
+   'java is a compiled programming language',
+   'the jets are a football team'
+ ]
+
+ labels = ['computers','sports','computers','sports']
+ ```
+
+ 2. Convert them to arrays of booleans representing which words they contain (or don't contain). Lurn provides vectorizers for this purpose.
+ ```
+ vectorizer = Lurn::Text::BernoulliVectorizer.new
+ vectorizer.fit(documents)
+ vectors = vectorizer.transform(documents)
+ ```
+
+ 3. Initialize and train the model
+ ```
+ model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
+ model.fit(vectors, labels)
+ ```
+
+ 4. Classify a new document
+ ```
+ new_vectors = vectorizer.transform(['programming is fun'])
+
+ # get the most probable class for the new document given the training data
+ model.max_class(new_vectors.first)
+
+ # get the probability score for the most probable class
+ model.max_probability(new_vectors.first)
+ ```
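
For this training set, `max_class` returns `'computers'` and `max_probability` a value around 0.97, consistent with the `predict_probabilities` output shown in the 0.1.1 README removed earlier in this diff.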
readmes/naive_bayes/multinomial_naive_bayes.md ADDED
@@ -0,0 +1,41 @@
+ ### Multinomial Naive Bayes
+ Naive bayes is a bayesian model often used for text classification. Multinomial Naive Bayes specifically classifies observations based on variables with a multinomial distribution (a.k.a. numbers).
+
+ Below is a simple text classification using Multinomial Naive Bayes in Lurn.
+
+ 1. Start with some text documents
+
+ ```ruby
+ documents = [
+   'ruby is a great programming language',
+   'the giants recently won the world series',
+   'java is a compiled programming language',
+   'the jets are a football team'
+ ]
+
+ labels = ['computers','sports','computers','sports']
+ ```
+
+ 2. Convert them to arrays of booleans representing which words they contain (or don't contain). Lurn provides vectorizers for this purpose.
+ ```
+ vectorizer = Lurn::Text::WordCountVectorizer.new
+ vectorizer.fit(documents)
+ vectors = vectorizer.transform(documents)
+ ```
+
+ 3. Initialize and train the model
+ ```
+ model = Lurn::NaiveBayes::MultinomialNaiveBayes.new
+ model.fit(vectors, labels)
+ ```
+
+ 4. Classify a new document
+ ```
+ new_vectors = vectorizer.transform(['programming is fun'])
+
+ # get the most probable class for the new document given the training data
+ model.max_class(new_vectors.first)
+
+ # get the probability score for the most probable class
+ model.max_probability(new_vectors.first)
+ ```
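
Note that despite the wording in step 2, `WordCountVectorizer#transform` (shown earlier in this diff) produces integer word counts rather than booleans, which is exactly the input the multinomial model expects.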
readmes/neighbors/knn_classification.md ADDED
@@ -0,0 +1,48 @@
+ ### K Nearest Neighbor Classifier
+ K Nearest Neighbor (KNN) Classification is one of the simplest forms of classification
+ in the machine learning toolbox. Training data is stored on the model and all
+ computation is deferred until the time of prediction. When a new observation
+ is provided it calculates the distance between the new observation and all
+ training data in an n-dimensional space (where n is the number of variables).
+ The predicted class is the most common class among the k closest training records.
+
+ Below is a simple example of using KNN Classification in Lurn.
+
+ Suppose we have a dataset containing the income, years of college eduction and job title
+ for a set of people. We could use this as training data to predict
+ people's job title based on their income and years of eduction.
+
+ ```ruby
+ people = [
+   # years of education   annual income   job title
+   [ 4,                   50000,          'engineer'],
+   [ 6,                   60000,          'scientist'],
+   [ 2,                   40000,          'engineer'],
+   [ 8,                   90000,          'scientist'],
+   [ 4,                   70000,          'librarian'],
+ ]
+
+ # eduction and income
+ predictors = people.map { |person| person[0..1] }
+
+ # extract annual income
+ target_var = people.map { |person| person[2]}
+ ```
+
+ The model can be trained by passing the predictors and target values to an initialized
+ instance of the KNNClassifier model.
+
+ ```ruby
+ # initialize the model with a k of 2
+ model = Lurn::Neighbors::KNNClassifier.new(2)
+
+ model.fit(predictors, target_var)
+ ```
+
+ The model can now be used to predict the income of a person given his/her
+ age and years of education.
+
+ ```ruby
+ # predict the job title of person with 4 years of eduction who make $45,000
+ model.predict([4, 45000]) # => engineer
+ ```
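
With k = 2, the two closest records to `[4, 45000]` are `[4, 50000]` and `[2, 40000]` (distances of 5000 and just over 5000, since income dominates the unscaled Euclidean metric), both labeled `'engineer'`, which is why the prediction comes out as shown.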
readmes/neighbors/knn_regression.md ADDED
@@ -0,0 +1,48 @@
+ ### K Nearest Neighbor Regression
+ K Nearest Neighbor (KNN) Regression is one of the simplest forms of regression
+ in the machine learning toolbox. Training data is stored on the model and all
+ computation is deferred until the time of prediction. When a new observation
+ is provided it calculates the distance between the new observation and all
+ training data in an n-dimensional space (where n is the number of variables).
+ The predicted value is the average value of the k closest training records.
+
+ Below is a simple example of using KNN Regression in Lurn.
+
+ Suppose we have a dataset containing the age, years of college eduction and annual
+ income for a set of people. We could use this as training data to predict
+ people's annual income based on their age and years of eduction.
+
+ ```ruby
+ people = [
+   # age   years of education   annual income
+   [ 25,   4,                   50000],
+   [ 35,   6,                   60000],
+   [ 51,   2,                   40000],
+   [ 45,   8,                   90000],
+   [ 32,   4,                   70000],
+ ]
+
+ # extract age and eduction
+ predictors = people.map { |person| person[0..1] }
+
+ # extract annual income
+ target_var = people.map { |person| person[2]}
+ ```
+
+ The model can be trained by passing the predictors and target values to an initialized
+ instance of the KNNRegression model.
+
+ ```ruby
+ # initialize the model with a k of 2
+ model = Lurn::Neighbors::KNNRegression.new(2)
+
+ model.fit(predictors, target_var)
+ ```
+
+ The model can now be used to predict the income of a person given his/her
+ age and years of education.
+
+ ```ruby
+ # predict the income of a 31 year old person with 4 years of eduction
+ model.predict([31, 4])
+ ```
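
Run as written, the two nearest neighbors of `[31, 4]` are `[32, 4]` and `[35, 6]`, so the prediction is `(70000 + 60000) / 2.0 # => 65000.0`.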
readmes/text_processing/bernoulli_vectorizer.md ADDED
@@ -0,0 +1,30 @@
+ # Bernoulli Vectorizer (word presence vectorizer)
+
+ A bernoulli document model is one that represents a piece of text as an array of boolean values. Each boolean represents the presence (true) or absence (false) of a word in the document.
+
+ `Lurn::Text::BernoulliVectorizer` is intended to make it easy to convert text into Bernoulli vectors.
+
+ ## Basic example
+ ```
+ docs = ['hello world', 'hello fred']
+
+ vectorizer = Lurn::Text::BernoulliVectorizer.new
+
+ # vectorizers must be trained in order to know
+ # what features (words) exist in the data set
+ vectorizer.fit(docs)
+
+ vectorizer.transform(docs)
+ ```
+
+ ## Configuration
+ The BernoulliVectorizer.new includes a number of options for configuring how documents are vectorized. A few include:
+ - max_df[int]: Excludes words which appear in more than `max_df` documents
+ - min_df[int]: Excludes words which appear in fewer than `min_df` documents
+ - strip_stopwords[boolean]: Removes stop words if true
+ - stem_words[boolean]: Stems words in the documents if true
+ - ngrams[int]: Features will be determined based on groupings of `ngrams` consecutive words instead of individual words
+
+ ```
+ Lurn::BernoulliVectorizer.new(strip_stopwords: true, min_df: 10)
+ ```
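
For the basic example above, the fitted vocabulary is the sorted word list `["fred", "hello", "world"]`, so `transform` should yield presence vectors along those positions (hypothetical output, assuming the boolean representation used by the naive-bayes code path in this diff):

```ruby
vectorizer.transform(docs)
# => [[false, true, true],   # 'hello world'
#     [true,  true, false]]  # 'hello fred'
```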
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: lurn
  version: !ruby/object:Gem::Version
-   version: 0.1.1
+   version: 0.1.2
  platform: ruby
  authors:
  - daniel.carpenter
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2017-12-13 00:00:00.000000000 Z
+ date: 2018-08-17 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: terminal-table
@@ -16,14 +16,20 @@ dependencies:
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 1.7.3
+         version: 1.8.0
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.8.0
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 1.7.3
+         version: 1.8.0
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.8.0
  - !ruby/object:Gem::Dependency
    name: ruby-stemmer
    requirement: !ruby/object:Gem::Requirement
@@ -44,14 +50,14 @@ dependencies:
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.1.6
+         version: 0.2.1
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.1.6
+         version: 0.2.1
  - !ruby/object:Gem::Dependency
    name: bundler
    requirement: !ruby/object:Gem::Requirement
@@ -98,30 +104,70 @@ dependencies:
    name: awesome_print
    requirement: !ruby/object:Gem::Requirement
      requirements:
-     - - ">="
+     - - "~>"
        - !ruby/object:Gem::Version
          version: '0'
    type: :development
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
-     - - ">="
+     - - "~>"
        - !ruby/object:Gem::Version
          version: '0'
  - !ruby/object:Gem::Dependency
    name: byebug
    requirement: !ruby/object:Gem::Requirement
      requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
      - - ">="
        - !ruby/object:Gem::Version
-         version: '0'
+         version: 10.0.2
    type: :development
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
      - - ">="
        - !ruby/object:Gem::Version
-         version: '0'
+         version: 10.0.2
+ - !ruby/object:Gem::Dependency
+   name: rspec_junit_formatter
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.4'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.4.1
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.4'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.4.1
+ - !ruby/object:Gem::Dependency
+   name: yard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.9
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.9
  description: " A gem with tools for machine learning. "
  email:
  - daniel.carpenter01@gmail.com
@@ -129,6 +175,7 @@ executables: []
  extensions: []
  extra_rdoc_files: []
  files:
+ - ".circleci/config.yml"
  - ".gitignore"
  - ".rspec"
  - ".travis.yml"
@@ -140,11 +187,22 @@ files:
  - bin/setup
  - lib/lurn.rb
  - lib/lurn/evaluation/classifier_evaluator.rb
+ - lib/lurn/naive_bayes/base.rb
  - lib/lurn/naive_bayes/bernoulli_naive_bayes.rb
+ - lib/lurn/naive_bayes/multinomial_naive_bayes.rb
+ - lib/lurn/neighbors/knn_base.rb
+ - lib/lurn/neighbors/knn_classifier.rb
+ - lib/lurn/neighbors/knn_regression.rb
  - lib/lurn/text/bernoulli_vectorizer.rb
+ - lib/lurn/text/word_count_vectorizer.rb
  - lib/lurn/text/word_tokenizer.rb
- - lib/lurn/version.rb
  - lurn.gemspec
+ - readmes/evaluation/classifier_evaluator.md
+ - readmes/naive_bayes/bernoulli_naive_bayes.md
+ - readmes/naive_bayes/multinomial_naive_bayes.md
+ - readmes/neighbors/knn_classification.md
+ - readmes/neighbors/knn_regression.md
+ - readmes/text_processing/bernoulli_vectorizer.md
  homepage: https://www.github.com/dansbits/lurn
  licenses:
  - MIT
lib/lurn/version.rb DELETED
@@ -1,3 +0,0 @@
- module Lurn
-   VERSION = "0.1.1"
- end