lurn 0.1.1 → 0.1.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: b628aa2df6567044144aebc12d52f284b0eb93e9
-   data.tar.gz: 7d6089b8ca48eb371e39288ae034543b6f447e9d
+   metadata.gz: 74be6a1bd3e76e61d34048367f8fafed76c39a46
+   data.tar.gz: 355ea667da4dd95845d00d8ebebc40636d740cab
  SHA512:
-   metadata.gz: 0140373cd80d2594c4c34e5c9959f043b1f33ff527b5dd5e8ebcba7173153ee2ed3766b2f49888780040dbbe1b04e6bfce656bf0cd0b21294a9c0e50898bd798
-   data.tar.gz: dd31fabb232408c405fe7e40a630723fa39ab30f28982b8db6f0f6cf3dfeca4fc4bf4dbaae5df889e48c101617208993ee01701b1e2f34e0d550c499299d1789
+   metadata.gz: 9dad1d2540818efd226bb029aca818b19f5995a3bfbe77392e2577236038f7c84237cb4b19203e0101da1830338d135ea321e27cbecd32664851994c5a100035
+   data.tar.gz: 4defd5fc70dcfbd3389ab184cf59764cf734bc8bda9347070e0c448cb7286ca7d1cd17ca926785191501ca030a4f8f3c74639fa4271a3f6d13cabe88d2a012d0
.circleci/config.yml ADDED
@@ -0,0 +1,46 @@
+ # Ruby CircleCI 2.0 configuration file
+ #
+ # Check https://circleci.com/docs/2.0/language-ruby/ for more details
+ #
+ version: 2
+ jobs:
+   build:
+     docker:
+       # specify the version you desire here
+       - image: circleci/ruby:2.4
+
+       # Specify service dependencies here if necessary
+       # CircleCI maintains a library of pre-built images
+       # documented at https://circleci.com/docs/2.0/circleci-images/
+       # - image: circleci/postgres:9.4
+
+     working_directory: ~/repo
+
+     steps:
+       - checkout
+
+       - run:
+           name: install dependencies
+           command: |
+             bundle install --jobs=4 --retry=3 --path vendor/bundle
+
+       # run tests!
+       - run:
+           name: run tests
+           command: |
+             mkdir /tmp/test-results
+             TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
+
+             bundle exec rspec --format progress \
+                               --format RspecJunitFormatter \
+                               --out /tmp/test-results/rspec.xml \
+                               --format progress \
+                               -- \
+                               $TEST_FILES
+
+       # collect reports
+       - store_test_results:
+           path: /tmp/test-results
+       - store_artifacts:
+           path: /tmp/test-results
+           destination: test-results
data/README.md CHANGED
@@ -1,6 +1,6 @@
  # Lurn
 
- Lurn is a ruby gem for performing machine learning. The API and design patterns in Lurn are inspired by sklearn, an analogous library for Python.
+ Lurn is a Ruby gem for performing machine learning tasks. The API and design patterns in Lurn are inspired by scikit-learn, a popular machine learning library for Python.
 
  ## Installation
 
@@ -20,37 +20,16 @@ Or install it yourself as:
 
  ## Usage
 
- ### Bernoulli Naive Bayes
- ```ruby
- require 'lurn'
-
- documents = [
-   'ruby is a great programming language',
-   'the giants recently won the world series',
-   'java is a compiled programming language',
-   'the jets are a football team'
- ]
-
- labels = ['computers','sports','computers','sports']
-
- # vectorizers take raw data and transform it to a set of features that our
- # model can understand - in this case an array of boolean values representing
- # the presence or absence of a word in text
- vectorizer = Lurn::Text::BernoulliVectorizer.new
- vectorizer.fit(documents)
- vectors = vectorizer.transform(documents)
-
- model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
- model.fit(vectors, labels)
-
- new_vectors = vectorizer.transform(['programming is fun'])
- probabilities = model.predict_probabilities(new_vectors.first)
- # => [0.9715681919147049, 0.028431808085295614]
-
- # to get the class of the maximum probability, look at the same index of the
- # unique_labels attribute on the model
- model.unique_labels[0] # => 'computers'
- ```
+ - Naive Bayes
+   - [Bernoulli Naive Bayes](readmes/naive_bayes/bernoulli_naive_bayes.md)
+   - [Multinomial Naive Bayes](readmes/naive_bayes/multinomial_naive_bayes.md)
+ - Nearest Neighbor Models
+   - [K Nearest Neighbor Regression](readmes/neighbors/knn_regression.md)
+   - [K Nearest Neighbor Classification](readmes/neighbors/knn_classification.md)
+ - Text Processing
+   - [Bernoulli Vectorizer](readmes/text_processing/bernoulli_vectorizer.md)
+ - Model Evaluation
+   - [ClassifierEvaluator](readmes/evaluation/classifier_evaluator.md)
 
  ## Development
 
lib/lurn.rb CHANGED
@@ -1,10 +1,14 @@
  require "daru"
- require "lurn/version"
  require "lurn/text/word_tokenizer"
  require "lurn/text/bernoulli_vectorizer"
+ require "lurn/text/word_count_vectorizer"
+ require "lurn/naive_bayes/base"
  require "lurn/naive_bayes/bernoulli_naive_bayes"
+ require "lurn/naive_bayes/multinomial_naive_bayes"
  require "lurn/evaluation/classifier_evaluator"
+ require "lurn/neighbors/knn_base"
+ require "lurn/neighbors/knn_regression"
+ require "lurn/neighbors/knn_classifier"
 
  module Lurn
-   # Your code goes here...
  end
lib/lurn/naive_bayes/base.rb ADDED
@@ -0,0 +1,32 @@
+ module Lurn
+   module NaiveBayes
+     class Base
+       def predict_probabilities(vector)
+         log_probabilities = predict_log_probabilities(vector)
+
+         log_probabilities.map { |p| Math.exp(p) }
+       end
+
+       def max_class(vector)
+         log_probs = predict_log_probabilities(vector)
+
+         max_index = log_probs.index(log_probs.max)
+
+         unique_labels[max_index]
+       end
+
+       def max_probability(vector)
+         probs = predict_probabilities(vector)
+
+         probs.max
+       end
+
+       def predict_log_probabilities(vector)
+         vector = Vector.elements(vector)
+         jll = joint_log_likelihood(vector)
+         log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))
+         jll.map { |v| v - log_prob_x }
+       end
+     end
+   end
+ end
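The new `Base#predict_log_probabilities` normalizes each class's joint log-likelihood by the log-evidence `log P(x) = log(Σ exp(jll))`, so the exponentiated results sum to one. A minimal standalone sketch of that normalization, with hypothetical `jll` values (`Vector` here comes from Ruby's stdlib `matrix`, which the gem requires in `bernoulli_naive_bayes.rb`):

```ruby
# Hypothetical joint log-likelihoods for two classes, as a subclass's
# joint_log_likelihood(vector) would return them.
jll = [-2.3, -4.1]

# log P(x): the same reduction Base#predict_log_probabilities performs.
log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))

log_posteriors = jll.map { |v| v - log_prob_x }
log_posteriors.map { |p| Math.exp(p) }
# => [0.8581..., 0.1418...] -- the posteriors now sum to 1.0
```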
lib/lurn/naive_bayes/bernoulli_naive_bayes.rb CHANGED
@@ -2,7 +2,7 @@ require 'matrix'
 
  module Lurn
    module NaiveBayes
-     class BernoulliNaiveBayes
+     class BernoulliNaiveBayes < Base
 
        attr_accessor :probability_matrix, :label_probabilities, :unique_labels
 
@@ -19,38 +19,7 @@ module Lurn
        document_count_matrix = build_document_count_matrix(vectors, labels)
        @probability_matrix = build_probability_matrix(document_count_matrix, labels)
 
-       @label_probabilities = @unique_labels.map { |l1| labels.select { |l2| l1 == l2 }.count.to_f / labels.count.to_f }
-     end
-
-     def predict_probabilities(vector)
-       log_probabilties = predict_log_probabilities(vector)
-
-       log_probabilties.map { |p| Math.exp(p) }
-     end
-
-     def predict_log_probabilities(vector)
-
-       probabilities = @unique_labels.map do |label|
-         joint_log_likelihood(vector, label)
-       end
-
-       log_prob_x = Math.log(probabilities.map { |v| Math.exp(v) }.sum)
-
-       probabilities.map { |p| p - log_prob_x }
-     end
-
-     def max_class(vector)
-       log_probs = predict_log_probabilities(vector)
-
-       max_index = log_probs.index(log_probs.max)
-
-       unique_labels[max_index]
-     end
-
-     def max_probability(vector)
-       probs = predict_probabilities(vector)
-
-       probs.max
+       @label_probabilities = @unique_labels.map { |l1| labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f }
      end
 
      def to_h
@@ -64,11 +33,11 @@ module Lurn
      private
 
      def build_probability_matrix(document_count_matrix, labels)
-       probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count) { 0.0 } }
+       probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
 
        document_count_matrix.each_with_index do |value, row, col|
          label = @unique_labels[row]
-         label_frequency = labels.select { |l| l == label }.count
+         label_frequency = labels.count(label)
 
          probability_matrix[row][col] = Math.log((value.to_f + @k) / (label_frequency.to_f + (2.0 * @k)))
        end
@@ -77,7 +46,7 @@ module Lurn
      end
 
      def build_document_count_matrix(vectors, labels)
-       matrix = Array.new(@unique_labels.count) { Array.new(@feature_count) { 0 } }
+       matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
 
        vectors.each_with_index do |value, row, col|
          if value == true
@@ -90,16 +59,20 @@ module Lurn
        Matrix.rows(matrix)
      end
 
-     def joint_log_likelihood(vector, label)
-       label_index = @unique_labels.index(label)
+     def joint_log_likelihood(x)
+       jlls = []
 
-       vector = Vector.elements(vector.map { |e| e == true ? 1 : 0 })
-       probabilities = @probability_matrix.row(label_index)
-       neg_probs = probabilities.map { |prb| Math.log(1.0 - Math.exp(prb)) }
-       jll = vector.dot(probabilities - neg_probs)
-       jll += Math.log(@label_probabilities[label_index]) + neg_probs.sum
+       unique_labels.each_with_index do |label, label_index|
+         vector = Vector.elements(x.map { |e| e == true ? 1 : 0 })
+         probabilities = @probability_matrix.row(label_index)
+         neg_probs = probabilities.map { |prb| Math.log(1.0 - Math.exp(prb)) }
+         jll = vector.dot(probabilities - neg_probs)
+         jll += Math.log(@label_probabilities[label_index]) + neg_probs.inject(:+)
+
+         jlls.push jll
+       end
 
-       jll
+       jlls
      end
 
    end
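The refactored `joint_log_likelihood` evaluates the Bernoulli log-likelihood in vectorized form via the identity Σᵢ [xᵢ log pᵢ + (1 − xᵢ) log(1 − pᵢ)] = x · (log p − log(1 − p)) + Σᵢ log(1 − pᵢ), which is exactly `vector.dot(probabilities - neg_probs) + neg_probs.inject(:+)`. A small self-contained check of that identity with hypothetical per-feature probabilities:

```ruby
require 'matrix'

# Hypothetical log P(feature | class) values for one class.
log_probs = Vector[Math.log(0.8), Math.log(0.1), Math.log(0.5)]
neg_probs = log_probs.map { |lp| Math.log(1.0 - Math.exp(lp)) }

x = Vector[1, 0, 1] # feature presence/absence

# Dot-product form used by the gem ...
fast = x.dot(log_probs - neg_probs) + neg_probs.inject(:+)

# ... versus the naive per-feature sum.
naive = x.to_a.zip(log_probs.to_a, neg_probs.to_a)
         .sum { |xi, lp, np| xi == 1 ? lp : np }

(fast - naive).abs < 1e-12 # => true
```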
lib/lurn/naive_bayes/multinomial_naive_bayes.rb ADDED
@@ -0,0 +1,65 @@
+ module Lurn
+   module NaiveBayes
+     class MultinomialNaiveBayes < Base
+
+       attr_accessor :prior_probabilities, :probability_matrix, :unique_labels
+
+       def initialize
+
+       end
+
+       def fit(vectors, labels)
+         vectors = Matrix.rows(vectors)
+
+         @unique_labels = labels.uniq
+         @feature_count = vectors.column_size
+         count_matrix = build_count_matrix(vectors, labels)
+         @probability_matrix = build_probability_matrix(count_matrix, labels)
+         @prior_probabilities = @unique_labels.map do |l1|
+           labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f
+         end
+       end
+
+       private
+
+       def build_probability_matrix(count_matrix, labels)
+         probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
+
+         count_matrix.each_with_index do |value, row, col|
+           label = @unique_labels[row]
+           label_frequency = labels.count(label)
+
+           numerator = (value.to_f + 1.0)
+           denominator = count_matrix.row(row).inject(:+) + @feature_count
+           probability_matrix[row][col] = Math.log(numerator / denominator)
+         end
+
+         probability_matrix
+       end
+
+       def build_count_matrix(vectors, labels)
+         matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
+
+         vectors.each_with_index do |value, row, col|
+           label = labels[row]
+           label_index = @unique_labels.index(label)
+           matrix[label_index][col] += value
+         end
+
+         Matrix.rows(matrix)
+       end
+
+       def joint_log_likelihood(vector)
+         jlls = []
+         @unique_labels.each_with_index do |label, label_index|
+           probabilities = @probability_matrix[label_index]
+           jll = vector.dot(probabilities)
+           jll += Math.log(@prior_probabilities[label_index])
+           jlls.push(jll)
+         end
+
+         jlls
+       end
+     end
+   end
+ end
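`build_probability_matrix` applies Laplace (add-one) smoothing: each feature's conditional probability is `(count + 1) / (class_total + feature_count)`, so a word never seen with a class still keeps a nonzero probability. A worked toy example with hypothetical counts:

```ruby
# Word counts for one class over a hypothetical 3-word vocabulary.
class_counts = [2, 0, 1]
vocab_size   = class_counts.length     # => 3
class_total  = class_counts.inject(:+) # => 3

smoothed = class_counts.map { |c| (c + 1.0) / (class_total + vocab_size) }
# => [0.5, 0.1666..., 0.3333...] -- the unseen middle word keeps probability mass

smoothed.map { |p| Math.log(p) } # the log values the matrix stores
```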
lib/lurn/neighbors/knn_base.rb ADDED
@@ -0,0 +1,54 @@
+ module Lurn
+   module Neighbors
+     class KNNBase
+
+       attr_accessor :predictors, :targets, :k
+
+       def initialize(k)
+         @k = k
+       end
+
+       # Trains the KNN model to predict the target variable
+       # based on the predictors. For KNN models all computation is
+       # deferred until the time of prediction, so in this case the data
+       # is just stored.
+       #
+       # @param predictors [Array-like] An array of arrays containing the predictor data
+       # @param targets [Array-like] An array of the values you want to predict
+       def fit(predictors, targets)
+         @predictors = predictors.map { |pred| Vector.elements(pred) }
+         @targets = targets
+
+         nil
+       end
+
+       # Returns the predictors and target values for the k nearest neighbors of the vector parameter
+       #
+       # @param vector [Array-like] An array of the same length and type as the predictors used to train the model
+       # @return [Array, Array]
+       #   Returns two values. The first is an array of the predictors for the k nearest neighbors. The second is an
+       #   array of the corresponding target values for the k nearest neighbors.
+       def nearest_neighbors(vector)
+         vector = Vector.elements(vector)
+
+         distances = @predictors.map.with_index do |p, index|
+           { index: index, distance: euclidian_distance(p, vector), value: targets[index] }
+         end
+
+         distances.sort! { |x, y| x[:distance] <=> y[:distance] }
+
+         neighboring_predictors = distances.first(@k).map { |neighbor| @predictors[neighbor[:index]] }
+         neighboring_targets = distances.first(@k).map { |neighbor| @targets[neighbor[:index]] }
+
+         return neighboring_predictors, neighboring_targets
+       end
+
+       private
+
+       def euclidian_distance(vector1, vector2)
+         Math.sqrt((vector1 - vector2).map { |v| (v.abs)**2 }.inject(:+))
+       end
+
+     end
+   end
+ end
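A quick sketch of how `nearest_neighbors` behaves once fitted, using hypothetical toy data:

```ruby
require 'lurn'

model = Lurn::Neighbors::KNNBase.new(2)
model.fit([[0, 0], [1, 1], [10, 10]], [:a, :b, :c])

# the two closest training rows to [0.4, 0.4] are [0, 0] and [1, 1]
_predictors, targets = model.nearest_neighbors([0.4, 0.4])
targets # => [:a, :b]
```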
lib/lurn/neighbors/knn_classifier.rb ADDED
@@ -0,0 +1,23 @@
+ module Lurn
+   module Neighbors
+     class KNNClassifier < KNNBase
+
+       # Predicts the class of the given observation by selecting the most common class of the
+       # closest k training observations based on Euclidean distance. In the case of a tie the
+       # winner is the tied class that appears first (nearest) among the neighbors.
+       #
+       # @param vector [Array-like]
+       #   An array (or array-like) of the same length as the predictors used
+       #   to fit the model
+       # @return [Object] The predicted class
+       def predict(vector)
+         _, neighboring_targets = nearest_neighbors(vector)
+
+         class_frequencies = neighboring_targets.inject(Hash.new(0)) { |h, v| h[v] += 1; h }
+
+         neighboring_targets.max_by { |v| class_frequencies[v] }
+       end
+
+     end
+   end
+ end
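One behavioral detail worth noting: `max_by` returns the first element with the maximal count, and `neighboring_targets` comes back ordered nearest-first, so a frequency tie resolves to the class of the nearest tied neighbor rather than a random pick. A tiny illustration of that tie-break with hypothetical neighbors:

```ruby
# Neighbors ordered nearest-first, with a 2-2 tie between :a and :b.
neighboring_targets = [:b, :a, :a, :b]

class_frequencies = neighboring_targets.inject(Hash.new(0)) { |h, v| h[v] += 1; h }
neighboring_targets.max_by { |v| class_frequencies[v] }
# => :b -- the tied class whose neighbor is nearest wins
```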
lib/lurn/neighbors/knn_regression.rb ADDED
@@ -0,0 +1,20 @@
+ module Lurn
+   module Neighbors
+     class KNNRegression < KNNBase
+
+       # Predicts the value of the given observation by averaging the target value of the
+       # closest k predictor observations based on Euclidean distance.
+       #
+       # @param vector [Array-like]
+       #   An array (or array-like) of the same length as the predictors used
+       #   to fit the model
+       # @return [Float] The predicted value
+       def predict(vector)
+         _, neighboring_targets = nearest_neighbors(vector)
+
+         neighboring_targets.inject(:+).to_f / neighboring_targets.length.to_f
+       end
+
+     end
+   end
+ end
lib/lurn/text/bernoulli_vectorizer.rb CHANGED
@@ -17,7 +17,7 @@ module Lurn
        def fit(documents)
          @vocabulary = []
          tokenized_docs = tokenize_documents(documents)
-         @vocabulary = tokenized_docs.flatten.uniq.sort
+         @vocabulary = tokenized_docs.flatten(1).uniq.sort
          reduce_features(tokenized_docs)
        end
 
@@ -49,12 +49,9 @@ module Lurn
            end
          end
 
-         reduced_features = []
-         @vocabulary.each_with_index do |token, index|
+         reduced_features = @vocabulary.select.with_index do |token, index|
            freq = doc_frequencies[index]
-           if freq < @options[:max_df] && freq > @options[:min_df]
-             reduced_features.push token
-           end
+           @options[:min_df] < freq && freq < @options[:max_df]
          end
 
          @vocabulary = reduced_features
lib/lurn/text/word_count_vectorizer.rb ADDED
@@ -0,0 +1,65 @@
+ module Lurn
+   module Text
+     class WordCountVectorizer
+
+       attr_accessor :tokenizer
+       attr_accessor :vocabulary
+
+       def initialize(options = {})
+         @tokenizer = options[:tokenizer] || WordTokenizer.new
+         @vocabulary = []
+
+         options[:max_df] ||= 50
+         options[:min_df] ||= 0
+         @options = options
+       end
+
+       def fit(documents)
+         @vocabulary = []
+         tokenized_docs = tokenize_documents(documents)
+         @vocabulary = tokenized_docs.flatten(1).uniq.sort
+         reduce_features(tokenized_docs)
+       end
+
+       def to_h
+         {
+           tokenizer_options: @tokenizer.to_h,
+           vocabulary: @vocabulary
+         }
+       end
+
+       def transform(documents)
+         documents.map do |document|
+           tokens = @tokenizer.tokenize(document)
+           @vocabulary.map do |word|
+             tokens.count word
+           end
+         end
+       end
+
+       private
+
+       def reduce_features(tokenized_docs)
+         doc_frequencies = Array.new(@vocabulary.length, 0)
+
+         tokenized_docs.each do |tokens|
+           tokens.each do |token|
+             vocab_index = @vocabulary.index(token)
+             doc_frequencies[vocab_index] += 1
+           end
+         end
+
+         reduced_features = @vocabulary.select.with_index do |token, index|
+           freq = doc_frequencies[index]
+           @options[:min_df] < freq && freq < @options[:max_df]
+         end
+
+         @vocabulary = reduced_features
+       end
+
+       def tokenize_documents(documents)
+         documents.map { |doc| @tokenizer.tokenize(doc).uniq }
+       end
+     end
+   end
+ end
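A short sketch of what `WordCountVectorizer` produces, assuming the default whitespace tokenizer and document-frequency thresholds (documents hypothetical):

```ruby
require 'lurn'

docs = ['ruby ruby rocks', 'ruby rocks hard']

vectorizer = Lurn::Text::WordCountVectorizer.new
vectorizer.fit(docs)

vectorizer.vocabulary      # => ["hard", "rocks", "ruby"]
vectorizer.transform(docs) # => [[0, 1, 2], [1, 1, 1]] -- raw counts per document
```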
lib/lurn/text/word_tokenizer.rb CHANGED
@@ -21,17 +21,22 @@ module Lurn
          @options[:strip_punctuation] ||= false
          @options[:strip_stopwords] ||= false
          @options[:stem_words] ||= false
+         @options[:ngrams] ||= 1
        end
 
        def tokenize(document)
-         document = document.gsub(/[[:punct:]]/, '') if @options[:strip_punctuation] == true
-         document = document.gsub(/\s+/, ' ').split(" ")
+         document = document.gsub(/[[:punct:]]/, "") if @options[:strip_punctuation] == true
+         document = document.split("\s")
 
          if(@options[:stem_words])
            stemmer = Lingua::Stemmer.new(language: :en)
            document = document.map { |word| stemmer.stem(word) }
          end
 
+         if(@options[:ngrams] > 1)
+           document = document.each_cons(@options[:ngrams]).to_a
+         end
+
          document
        end
 
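The new `:ngrams` option groups consecutive tokens with `each_cons` instead of emitting single words. A quick illustration, assuming the options hash is passed straight to `WordTokenizer.new` as the vectorizer readmes suggest:

```ruby
require 'lurn'

tokenizer = Lurn::Text::WordTokenizer.new(ngrams: 2)
tokenizer.tokenize('the quick brown fox')
# => [["the", "quick"], ["quick", "brown"], ["brown", "fox"]]
```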
lurn.gemspec CHANGED
@@ -1,11 +1,7 @@
  # coding: utf-8
- lib = File.expand_path('../lib', __FILE__)
- $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
- require 'lurn/version'
-
  Gem::Specification.new do |spec|
    spec.name = "lurn"
-   spec.version = Lurn::VERSION
+   spec.version = "0.1.2"
    spec.authors = ["daniel.carpenter"]
    spec.email = ["daniel.carpenter01@gmail.com"]
 
@@ -21,13 +17,15 @@ Gem::Specification.new do |spec|
    spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
    spec.require_paths = ["lib"]
 
-   spec.add_dependency "terminal-table", "~> 1.7.3"
+   spec.add_dependency "terminal-table", "~> 1.8.0", ">= 1.8.0"
    spec.add_dependency "ruby-stemmer", "~> 0.9.6"
-   spec.add_dependency "daru", '~> 0.1.6'
+   spec.add_dependency "daru", "~> 0.2.1"
 
    spec.add_development_dependency "bundler", "~> 1.13"
    spec.add_development_dependency "rake", "~> 10.0"
    spec.add_development_dependency "rspec", "~> 3.0"
-   spec.add_development_dependency "awesome_print"
-   spec.add_development_dependency "byebug"
+   spec.add_development_dependency "awesome_print", "~> 0"
+   spec.add_development_dependency "byebug", "~> 10.0", ">= 10.0.2"
+   spec.add_development_dependency "rspec_junit_formatter", "~> 0.4", ">= 0.4.1"
+   spec.add_development_dependency "yard", "~> 0.9.9"
  end
readmes/evaluation/classifier_evaluator.md ADDED
@@ -0,0 +1,21 @@
+ # Classifier Evaluator
+ `Lurn::Evaluation::ClassifierEvaluator` provides some basic functionality for evaluating the performance of a classifier.
+
+ ## Example
+ ```ruby
+ actual_class = ['sports','science','science','sports']
+ predicted_class = ['sports','sports','science','sports']
+
+ eval = Lurn::Evaluation::ClassifierEvaluator.new predicted_class, actual_class
+
+ print eval.summary
+
+ # output
+ +-----------------+--------------------+--------+
+ | Class           | Precision          | Recall |
+ +-----------------+--------------------+--------+
+ | sports          | 0.6666666666666666 | 1.0    |
+ | science         | 1.0                | 0.5    |
+ | Overall Average | 0.8333333333333333 | 0.75   |
+ +-----------------+--------------------+--------+
+ ```
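The summary values follow from the standard definitions precision = TP / (TP + FP) and recall = TP / (TP + FN); a minimal hand check that reproduces the table above:

```ruby
actual    = ['sports', 'science', 'science', 'sports']
predicted = ['sports', 'sports', 'science', 'sports']
pairs     = predicted.zip(actual)

['sports', 'science'].each do |klass|
  tp = pairs.count { |pred, act| pred == klass && act == klass }
  fp = pairs.count { |pred, act| pred == klass && act != klass }
  fn = pairs.count { |pred, act| pred != klass && act == klass }
  puts "#{klass}: precision=#{tp.to_f / (tp + fp)} recall=#{tp.to_f / (tp + fn)}"
end
# sports: precision=0.6666666666666666 recall=1.0
# science: precision=1.0 recall=0.5
```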
readmes/naive_bayes/bernoulli_naive_bayes.md ADDED
@@ -0,0 +1,41 @@
+ ### Bernoulli Naive Bayes
+ Naive Bayes is a Bayesian model often used for text classification. Bernoulli Naive Bayes specifically classifies observations based on the presence or absence of a feature in an observation.
+
+ Below is a simple text classification using Naive Bayes in Lurn.
+
+ 1. Start with some text documents
+
+ ```ruby
+ documents = [
+   'ruby is a great programming language',
+   'the giants recently won the world series',
+   'java is a compiled programming language',
+   'the jets are a football team'
+ ]
+
+ labels = ['computers','sports','computers','sports']
+ ```
+
+ 2. Convert them to arrays of booleans representing which words they contain (or don't contain). Lurn provides vectorizers for this purpose.
+ ```ruby
+ vectorizer = Lurn::Text::BernoulliVectorizer.new
+ vectorizer.fit(documents)
+ vectors = vectorizer.transform(documents)
+ ```
+
+ 3. Initialize and train the model
+ ```ruby
+ model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
+ model.fit(vectors, labels)
+ ```
+
+ 4. Classify a new document
+ ```ruby
+ new_vectors = vectorizer.transform(['programming is fun'])
+
+ # get the most probable class for the new document given the training data
+ model.max_class(new_vectors.first)
+
+ # get the probability score for the most probable class
+ model.max_probability(new_vectors.first)
+ ```
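To inspect the full distribution rather than just the winner, the pattern from the previous version of the README still applies: `predict_probabilities` returns one probability per entry of `unique_labels`, aligned by index (output below taken from that earlier example):

```ruby
probabilities = model.predict_probabilities(new_vectors.first)
# => [0.9715681919147049, 0.028431808085295614]

# the class for each probability sits at the same index of unique_labels
model.unique_labels[0] # => 'computers'
```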
readmes/naive_bayes/multinomial_naive_bayes.md ADDED
@@ -0,0 +1,41 @@
+ ### Multinomial Naive Bayes
+ Naive Bayes is a Bayesian model often used for text classification. Multinomial Naive Bayes specifically classifies observations based on features with a multinomial distribution, i.e. discrete counts such as word frequencies.
+
+ Below is a simple text classification using Multinomial Naive Bayes in Lurn.
+
+ 1. Start with some text documents
+
+ ```ruby
+ documents = [
+   'ruby is a great programming language',
+   'the giants recently won the world series',
+   'java is a compiled programming language',
+   'the jets are a football team'
+ ]
+
+ labels = ['computers','sports','computers','sports']
+ ```
+
+ 2. Convert them to arrays of word counts representing how many times each word appears in each document. Lurn provides vectorizers for this purpose.
+ ```ruby
+ vectorizer = Lurn::Text::WordCountVectorizer.new
+ vectorizer.fit(documents)
+ vectors = vectorizer.transform(documents)
+ ```
+
+ 3. Initialize and train the model
+ ```ruby
+ model = Lurn::NaiveBayes::MultinomialNaiveBayes.new
+ model.fit(vectors, labels)
+ ```
+
+ 4. Classify a new document
+ ```ruby
+ new_vectors = vectorizer.transform(['programming is fun'])
+
+ # get the most probable class for the new document given the training data
+ model.max_class(new_vectors.first)
+
+ # get the probability score for the most probable class
+ model.max_probability(new_vectors.first)
+ ```
readmes/neighbors/knn_classification.md ADDED
@@ -0,0 +1,48 @@
+ ### K Nearest Neighbor Classifier
+ K Nearest Neighbor (KNN) Classification is one of the simplest forms of classification
+ in the machine learning toolbox. Training data is stored on the model and all
+ computation is deferred until the time of prediction. When a new observation
+ is provided it calculates the distance between the new observation and all
+ training data in an n-dimensional space (where n is the number of variables).
+ The predicted class is the most common class among the k closest training records.
+
+ Below is a simple example of using KNN Classification in Lurn.
+
+ Suppose we have a dataset containing the years of college education, annual income
+ and job title for a set of people. We could use this as training data to predict
+ people's job title based on their years of education and income.
+
+ ```ruby
+ people = [
+   # years of education  annual income  job title
+   [ 4,                  50000,         'engineer'],
+   [ 6,                  60000,         'scientist'],
+   [ 2,                  40000,         'engineer'],
+   [ 8,                  90000,         'scientist'],
+   [ 4,                  70000,         'librarian'],
+ ]
+
+ # extract education and income
+ predictors = people.map { |person| person[0..1] }
+
+ # extract job title
+ target_var = people.map { |person| person[2] }
+ ```
+
+ The model can be trained by passing the predictors and target values to an initialized
+ instance of the KNNClassifier model.
+
+ ```ruby
+ # initialize the model with a k of 2
+ model = Lurn::Neighbors::KNNClassifier.new(2)
+
+ model.fit(predictors, target_var)
+ ```
+
+ The model can now be used to predict the job title of a person given their
+ years of education and income.
+
+ ```ruby
+ # predict the job title of a person with 4 years of education who makes $45,000
+ model.predict([4, 45000]) # => 'engineer'
+ ```
readmes/neighbors/knn_regression.md ADDED
@@ -0,0 +1,48 @@
+ ### K Nearest Neighbor Regression
+ K Nearest Neighbor (KNN) Regression is one of the simplest forms of regression
+ in the machine learning toolbox. Training data is stored on the model and all
+ computation is deferred until the time of prediction. When a new observation
+ is provided it calculates the distance between the new observation and all
+ training data in an n-dimensional space (where n is the number of variables).
+ The predicted value is the average value of the k closest training records.
+
+ Below is a simple example of using KNN Regression in Lurn.
+
+ Suppose we have a dataset containing the age, years of college education and annual
+ income for a set of people. We could use this as training data to predict
+ people's annual income based on their age and years of education.
+
+ ```ruby
+ people = [
+   # age  years of education  annual income
+   [ 25,  4,                  50000],
+   [ 35,  6,                  60000],
+   [ 51,  2,                  40000],
+   [ 45,  8,                  90000],
+   [ 32,  4,                  70000],
+ ]
+
+ # extract age and education
+ predictors = people.map { |person| person[0..1] }
+
+ # extract annual income
+ target_var = people.map { |person| person[2] }
+ ```
+
+ The model can be trained by passing the predictors and target values to an initialized
+ instance of the KNNRegression model.
+
+ ```ruby
+ # initialize the model with a k of 2
+ model = Lurn::Neighbors::KNNRegression.new(2)
+
+ model.fit(predictors, target_var)
+ ```
+
+ The model can now be used to predict the income of a person given their
+ age and years of education.
+
+ ```ruby
+ # predict the income of a 31 year old person with 4 years of education
+ model.predict([31, 4])
+ ```
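For the data above, the two nearest neighbors of `[31, 4]` are `[32, 4]` (income 70000) and `[35, 6]` (income 60000), so the prediction comes out to their mean, `65000.0`.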
readmes/text_processing/bernoulli_vectorizer.md ADDED
@@ -0,0 +1,30 @@
+ # Bernoulli Vectorizer (word presence vectorizer)
+
+ A Bernoulli document model is one that represents a piece of text as an array of boolean values. Each boolean represents the presence (true) or absence (false) of a word in the document.
+
+ `Lurn::Text::BernoulliVectorizer` is intended to make it easy to convert text into Bernoulli vectors.
+
+ ## Basic example
+ ```ruby
+ docs = ['hello world', 'hello fred']
+
+ vectorizer = Lurn::Text::BernoulliVectorizer.new
+
+ # vectorizers must be trained in order to know
+ # what features (words) exist in the data set
+ vectorizer.fit(docs)
+
+ vectorizer.transform(docs)
+ ```
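With these two documents the fitted vocabulary is `['fred', 'hello', 'world']` (sorted alphabetically), so `transform` should return one boolean per vocabulary word for each document, e.g. `[[false, true, true], [true, true, false]]`, assuming the default tokenizer and document-frequency thresholds.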
+
+ ## Configuration
+ `BernoulliVectorizer.new` accepts a number of options for configuring how documents are vectorized. A few include:
+ - max_df [int]: Excludes words which appear in more than `max_df` documents
+ - min_df [int]: Excludes words which appear in fewer than `min_df` documents
+ - strip_stopwords [boolean]: Removes stop words if true
+ - stem_words [boolean]: Stems words in the documents if true
+ - ngrams [int]: Features will be determined based on groupings of `ngrams` consecutive words instead of individual words
+
+ ```ruby
+ Lurn::Text::BernoulliVectorizer.new(strip_stopwords: true, min_df: 10)
+ ```
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: lurn
  version: !ruby/object:Gem::Version
-   version: 0.1.1
+   version: 0.1.2
  platform: ruby
  authors:
  - daniel.carpenter
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2017-12-13 00:00:00.000000000 Z
+ date: 2018-08-17 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: terminal-table
@@ -16,14 +16,20 @@ dependencies:
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 1.7.3
+         version: 1.8.0
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.8.0
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 1.7.3
+         version: 1.8.0
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.8.0
  - !ruby/object:Gem::Dependency
    name: ruby-stemmer
    requirement: !ruby/object:Gem::Requirement
@@ -44,14 +50,14 @@ dependencies:
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.1.6
+         version: 0.2.1
    type: :runtime
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
      - - "~>"
        - !ruby/object:Gem::Version
-         version: 0.1.6
+         version: 0.2.1
  - !ruby/object:Gem::Dependency
    name: bundler
    requirement: !ruby/object:Gem::Requirement
@@ -98,30 +104,70 @@ dependencies:
    name: awesome_print
    requirement: !ruby/object:Gem::Requirement
      requirements:
-     - - ">="
+     - - "~>"
        - !ruby/object:Gem::Version
          version: '0'
    type: :development
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
-     - - ">="
+     - - "~>"
        - !ruby/object:Gem::Version
          version: '0'
  - !ruby/object:Gem::Dependency
    name: byebug
    requirement: !ruby/object:Gem::Requirement
      requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
      - - ">="
        - !ruby/object:Gem::Version
-         version: '0'
+         version: 10.0.2
    type: :development
    prerelease: false
    version_requirements: !ruby/object:Gem::Requirement
      requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
      - - ">="
        - !ruby/object:Gem::Version
-         version: '0'
+         version: 10.0.2
+ - !ruby/object:Gem::Dependency
+   name: rspec_junit_formatter
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.4'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.4.1
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.4'
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 0.4.1
+ - !ruby/object:Gem::Dependency
+   name: yard
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.9
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.9.9
  description: " A gem with tools for machine learning. "
  email:
  - daniel.carpenter01@gmail.com
@@ -129,6 +175,7 @@ executables: []
  extensions: []
  extra_rdoc_files: []
  files:
+ - ".circleci/config.yml"
  - ".gitignore"
  - ".rspec"
  - ".travis.yml"
@@ -140,11 +187,22 @@ files:
  - bin/setup
  - lib/lurn.rb
  - lib/lurn/evaluation/classifier_evaluator.rb
+ - lib/lurn/naive_bayes/base.rb
  - lib/lurn/naive_bayes/bernoulli_naive_bayes.rb
+ - lib/lurn/naive_bayes/multinomial_naive_bayes.rb
+ - lib/lurn/neighbors/knn_base.rb
+ - lib/lurn/neighbors/knn_classifier.rb
+ - lib/lurn/neighbors/knn_regression.rb
  - lib/lurn/text/bernoulli_vectorizer.rb
+ - lib/lurn/text/word_count_vectorizer.rb
  - lib/lurn/text/word_tokenizer.rb
- - lib/lurn/version.rb
  - lurn.gemspec
+ - readmes/evaluation/classifier_evaluator.md
+ - readmes/naive_bayes/bernoulli_naive_bayes.md
+ - readmes/naive_bayes/multinomial_naive_bayes.md
+ - readmes/neighbors/knn_classification.md
+ - readmes/neighbors/knn_regression.md
+ - readmes/text_processing/bernoulli_vectorizer.md
  homepage: https://www.github.com/dansbits/lurn
  licenses:
  - MIT
lib/lurn/version.rb DELETED
@@ -1,3 +0,0 @@
- module Lurn
-   VERSION = "0.1.1"
- end