lurn 0.1.1 → 0.1.2
- checksums.yaml +4 -4
- data/.circleci/config.yml +46 -0
- data/README.md +11 -32
- data/lib/lurn.rb +6 -2
- data/lib/lurn/naive_bayes/base.rb +32 -0
- data/lib/lurn/naive_bayes/bernoulli_naive_bayes.rb +17 -44
- data/lib/lurn/naive_bayes/multinomial_naive_bayes.rb +65 -0
- data/lib/lurn/neighbors/knn_base.rb +54 -0
- data/lib/lurn/neighbors/knn_classifier.rb +23 -0
- data/lib/lurn/neighbors/knn_regression.rb +20 -0
- data/lib/lurn/text/bernoulli_vectorizer.rb +3 -6
- data/lib/lurn/text/word_count_vectorizer.rb +65 -0
- data/lib/lurn/text/word_tokenizer.rb +7 -2
- data/lurn.gemspec +7 -9
- data/readmes/evaluation/classifier_evaluator.md +21 -0
- data/readmes/naive_bayes/bernoulli_naive_bayes.md +41 -0
- data/readmes/naive_bayes/multinomial_naive_bayes.md +41 -0
- data/readmes/neighbors/knn_classification.md +48 -0
- data/readmes/neighbors/knn_regression.md +48 -0
- data/readmes/text_processing/bernoulli_vectorizer.md +30 -0
- metadata +69 -11
- data/lib/lurn/version.rb +0 -3
checksums.yaml CHANGED

````diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 74be6a1bd3e76e61d34048367f8fafed76c39a46
+  data.tar.gz: 355ea667da4dd95845d00d8ebebc40636d740cab
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9dad1d2540818efd226bb029aca818b19f5995a3bfbe77392e2577236038f7c84237cb4b19203e0101da1830338d135ea321e27cbecd32664851994c5a100035
+  data.tar.gz: 4defd5fc70dcfbd3389ab184cf59764cf734bc8bda9347070e0c448cb7286ca7d1cd17ca926785191501ca030a4f8f3c74639fa4271a3f6d13cabe88d2a012d0
````
data/.circleci/config.yml ADDED

````diff
@@ -0,0 +1,46 @@
+# Ruby CircleCI 2.0 configuration file
+#
+# Check https://circleci.com/docs/2.0/language-ruby/ for more details
+#
+version: 2
+jobs:
+  build:
+    docker:
+      # specify the version you desire here
+      - image: circleci/ruby:2.4
+
+      # Specify service dependencies here if necessary
+      # CircleCI maintains a library of pre-built images
+      # documented at https://circleci.com/docs/2.0/circleci-images/
+      # - image: circleci/postgres:9.4
+
+    working_directory: ~/repo
+
+    steps:
+      - checkout
+
+      - run:
+          name: install dependencies
+          command: |
+            bundle install --jobs=4 --retry=3 --path vendor/bundle
+
+      # run tests!
+      - run:
+          name: run tests
+          command: |
+            mkdir /tmp/test-results
+            TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
+
+            bundle exec rspec --format progress \
+                              --format RspecJunitFormatter \
+                              --out /tmp/test-results/rspec.xml \
+                              --format progress \
+                              -- \
+                              $TEST_FILES
+
+      # collect reports
+      - store_test_results:
+          path: /tmp/test-results
+      - store_artifacts:
+          path: /tmp/test-results
+          destination: test-results
````
data/README.md CHANGED

````diff
@@ -1,6 +1,6 @@
 # Lurn
 
-Lurn is a ruby gem for performing machine learning. The API and design patterns in Lurn are inspired by
+Lurn is a ruby gem for performing machine learning tasks. The API and design patterns in Lurn are inspired by scikit-learn, a popular machine learning library for Python.
 
 ## Installation
 
@@ -20,37 +20,16 @@ Or install it yourself as:
 
 ## Usage
 
-```ruby
-documents = [
-  'ruby is a great programming language',
-  'the giants recently won the world series',
-  'java is a compiled programming language',
-  'the jets are a football team'
-]
-
-labels = ['computers','sports','computers','sports']
-
-# vectorizers take raw data and transform it to a set of features that our
-# model can understand - in this case an array of boolean values representing
-# the presence or absence of a word in text
-vectorizer = Lurn::Text::BernoulliVectorizer.new
-vectorizer.fit(documents)
-vectors = vectorizer.transform(documents)
-
-model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
-model.fit(vectors, labels)
-
-new_vectors = vectorizer.transform(['programming is fun'])
-probabilities = model.predict_probabilities(new_vectors.first)
-# => [0.9715681919147049, 0.028431808085295614]
-
-# to get the class of the maximum probability, look at the same index of the
-# unique_labels attribute on the model
-model.unique_labels[0] # => 'computers'
-```
+- Naive Bayes
+  - [Bernoulli Naive Bayes](readmes/naive_bayes/bernoulli_naive_bayes.md)
+  - [Multinomial Naive Bayes](readmes/naive_bayes/multinomial_naive_bayes.md)
+- Nearest Neighbor Models
+  - [K Nearest Neighbor Regression](readmes/neighbors/knn_regression.md)
+  - [K Nearest Neighbor Classification](readmes/neighbors/knn_classification.md)
+- Text Processing
+  - [Bernoulli Vectorizer](readmes/text_processing/bernoulli_vectorizer.md)
+- Model Evaluation
+  - [ClassifierEvaluator](readmes/evaluation/classifier_evaluator.md)
 
 ## Development
````
data/lib/lurn.rb CHANGED

````diff
@@ -1,10 +1,14 @@
 require "daru"
-require "lurn/version"
 require "lurn/text/word_tokenizer"
 require "lurn/text/bernoulli_vectorizer"
+require "lurn/text/word_count_vectorizer"
+require "lurn/naive_bayes/base"
 require "lurn/naive_bayes/bernoulli_naive_bayes"
+require "lurn/naive_bayes/multinomial_naive_bayes"
 require "lurn/evaluation/classifier_evaluator"
+require "lurn/neighbors/knn_base"
+require "lurn/neighbors/knn_regression"
+require "lurn/neighbors/knn_classifier"
 
 module Lurn
-  # Your code goes here...
 end
````
data/lib/lurn/naive_bayes/base.rb ADDED

````diff
@@ -0,0 +1,32 @@
+module Lurn
+  module NaiveBayes
+    class Base
+      def predict_probabilities(vector)
+        log_probabilties = predict_log_probabilities(vector)
+
+        log_probabilties.map { |p| Math.exp(p) }
+      end
+
+      def max_class(vector)
+        log_probs = predict_log_probabilities(vector)
+
+        max_index = log_probs.index(log_probs.max)
+
+        unique_labels[max_index]
+      end
+
+      def max_probability(vector)
+        probs = predict_probabilities(vector)
+
+        probs.max
+      end
+
+      def predict_log_probabilities(vector)
+        vector = Vector.elements(vector)
+        jll = joint_log_likelihood(vector)
+        log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))
+        jll.map{ |v| v - log_prob_x }
+      end
+    end
+  end
+end
````
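The heavy lifting in `Base` is `predict_log_probabilities`, which turns per-class joint log-likelihoods into normalized posteriors by subtracting the log of their summed exponentials. A minimal sketch of that normalization step, using made-up likelihood values (not gem output):

```ruby
# Hypothetical joint log-likelihoods for two classes.
jll = [-1.2, -3.4]

# Normalize in log space: subtract log(sum of exponentiated likelihoods),
# exactly as predict_log_probabilities does.
log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))
log_posteriors = jll.map { |v| v - log_prob_x }

# Exponentiating yields probabilities that sum to 1 (up to float error).
probabilities = log_posteriors.map { |p| Math.exp(p) }
probabilities.inject(:+) # => ~1.0
```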
data/lib/lurn/naive_bayes/bernoulli_naive_bayes.rb CHANGED

````diff
@@ -2,7 +2,7 @@ require 'matrix'
 
 module Lurn
   module NaiveBayes
-    class BernoulliNaiveBayes
+    class BernoulliNaiveBayes < Base
 
       attr_accessor :probability_matrix, :label_probabilities, :unique_labels
 
@@ -19,38 +19,7 @@ module Lurn
         document_count_matrix = build_document_count_matrix(vectors, labels)
         @probability_matrix = build_probability_matrix(document_count_matrix, labels)
 
-        @label_probabilities = @unique_labels.map { |l1| labels.
-      end
-
-      def predict_probabilities(vector)
-        log_probabilties = predict_log_probabilities(vector)
-
-        log_probabilties.map { |p| Math.exp(p) }
-      end
-
-      def predict_log_probabilities(vector)
-
-        probabilities = @unique_labels.map do |label|
-          joint_log_likelihood(vector, label)
-        end
-
-        log_prob_x = Math.log(probabilities.map { |v| Math.exp(v) }.sum)
-
-        probabilities.map { |p| p - log_prob_x }
-      end
-
-      def max_class(vector)
-        log_probs = predict_log_probabilities(vector)
-
-        max_index = log_probs.index(log_probs.max)
-
-        unique_labels[max_index]
-      end
-
-      def max_probability(vector)
-        probs = predict_probabilities(vector)
-
-        probs.max
+        @label_probabilities = @unique_labels.map { |l1| labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f }
       end
 
       def to_h
@@ -64,11 +33,11 @@ module Lurn
       private
 
       def build_probability_matrix(document_count_matrix, labels)
-        probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count
+        probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
 
         document_count_matrix.each_with_index do |value, row, col|
           label = @unique_labels[row]
-          label_frequency = labels.
+          label_frequency = labels.count(label)
 
           probability_matrix[row][col] = Math.log((value.to_f + @k) / (label_frequency.to_f + (2.0 * @k)))
         end
@@ -77,7 +46,7 @@ module Lurn
       end
 
       def build_document_count_matrix(vectors, labels)
-        matrix = Array.new(@unique_labels.count) { Array.new(@feature_count
+        matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
 
         vectors.each_with_index do |value, row, col|
           if value == true
@@ -90,16 +59,20 @@ module Lurn
         Matrix.rows(matrix)
       end
 
-      def joint_log_likelihood(
-
-
-
-
-
+      def joint_log_likelihood(x)
+        jlls = []
+
+        unique_labels.each_with_index do |label, label_index|
+          vector = Vector.elements(x.map { |e| e == true ? 1 : 0 })
+          probabilities = @probability_matrix.row(label_index)
+          neg_probs = probabilities.map { |prb| Math.log(1.0 - Math.exp(prb)) }
+          jll = vector.dot(probabilities - neg_probs)
+          jll += Math.log(@label_probabilities[label_index]) + neg_probs.inject(:+)
+
+          jlls.push jll
+        end
 
-
+        jlls
       end
 
     end
````
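The rewritten `joint_log_likelihood` leans on an algebraic shortcut: summing `x_i * log(p_i) + (1 - x_i) * log(1 - p_i)` over all features equals `x . (log p - log(1 - p)) + sum(log(1 - p))`, which is what the `vector.dot(...)` and `neg_probs.inject(:+)` lines compute. A small check of that identity with assumed probabilities (not values from the gem):

```ruby
require 'matrix'

x       = Vector[1, 0, 1]  # presence/absence flags for three words
log_p   = Vector[Math.log(0.7), Math.log(0.2), Math.log(0.5)]
log_1mp = log_p.map { |v| Math.log(1.0 - Math.exp(v)) }

# Direct sum of the per-feature Bernoulli log-likelihood terms.
direct = x.to_a.each_with_index.sum { |xi, i| xi * log_p[i] + (1 - xi) * log_1mp[i] }

# The factored form used by joint_log_likelihood.
factored = x.dot(log_p - log_1mp) + log_1mp.inject(:+)

(direct - factored).abs < 1e-12 # => true
```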
data/lib/lurn/naive_bayes/multinomial_naive_bayes.rb ADDED

````diff
@@ -0,0 +1,65 @@
+module Lurn
+  module NaiveBayes
+    class MultinomialNaiveBayes < Base
+
+      attr_accessor :prior_probabilities, :probability_matrix, :unique_labels
+
+      def initialize
+
+      end
+
+      def fit(vectors, labels)
+        vectors = Matrix.rows(vectors)
+
+        @unique_labels = labels.uniq
+        @feature_count = vectors.column_size
+        count_matrix = build_count_matrix(vectors, labels)
+        @probability_matrix = build_probability_matrix(count_matrix, labels)
+        @prior_probabilities = @unique_labels.map do |l1|
+          labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f
+        end
+      end
+
+      private
+
+      def build_probability_matrix(count_matrix, labels)
+        probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
+
+        count_matrix.each_with_index do |value, row, col|
+          label = @unique_labels[row]
+          label_frequency = labels.count(label)
+
+          numerator = (value.to_f + 1.0)
+          denominator = count_matrix.row(row).inject(:+) + @feature_count
+          probability_matrix[row][col] = Math.log(numerator / denominator)
+        end
+
+        probability_matrix
+      end
+
+      def build_count_matrix(vectors, labels)
+        matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
+
+        vectors.each_with_index do |value, row, col|
+          label = labels[row]
+          label_index = @unique_labels.index(label)
+          matrix[label_index][col] += value
+        end
+
+        Matrix.rows(matrix)
+      end
+
+      def joint_log_likelihood(vector)
+        jlls = []
+        @unique_labels.each_with_index do |label, label_index|
+          probabilities = @probability_matrix[label_index]
+          jll = vector.dot(probabilities)
+          jll += Math.log(@prior_probabilities[label_index])
+          jlls.push(jll)
+        end
+
+        jlls
+      end
+    end
+  end
+end
````
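`build_probability_matrix` applies add-one (Laplace) smoothing: each word count is incremented by one and the class's total count by the vocabulary size, so a word never seen with a class still gets a small nonzero probability. Illustrative numbers, invented for this sketch:

```ruby
count       = 3   # times the word appears in documents of this class
class_total = 20  # total word count across the class's documents
vocab_size  = 10  # number of words in the vocabulary

# Smoothed log probability: log((3 + 1) / (20 + 10)) rather than log(3 / 20),
# so an unseen word (count = 0) still gets log(1 / 30) instead of -Infinity.
Math.log((count + 1.0) / (class_total + vocab_size)) # => ~-2.015
```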
data/lib/lurn/neighbors/knn_base.rb ADDED

````diff
@@ -0,0 +1,54 @@
+module Lurn
+  module Neighbors
+    class KNNBase
+
+      attr_accessor :predictors, :targets, :k
+
+      def initialize(k)
+        @k = k
+      end
+
+      # Trains the KNN regression model to predict the target variable
+      # based on the predictors. For KNN Regression all computation is
+      # deferred until the time of prediction so in this case the data
+      # is just stored.
+      #
+      # @param predictors [Array-like] An array of arrays containing the predictor data
+      # @param targets [Array-like] An array with the value you want to predict
+      def fit(predictors, targets)
+        @predictors = predictors.map { |pred| Vector.elements(pred) }
+        @targets = targets
+
+        nil
+      end
+
+      # Returns the predictors and target value for the k nearest neighbors for the vector parameter
+      #
+      # @param vector [Array-like] An array of the same length and type as the predictors used to train the model
+      # @return [Array, Array]
+      #   Returns two values. The first is an array of the predictors for the k nearest neighbors. The second is an
+      #   array of the corresponding target values for the k nearest neighbors.
+      def nearest_neighbors(vector)
+        vector = Vector.elements(vector)
+
+        distances = @predictors.map.with_index do |p, index|
+          { index: index, distance: euclidian_distance(p, vector), value: targets[index] }
+        end
+
+        distances.sort! { |x,y| x[:distance] <=> y[:distance] }
+
+        neighboring_predictors = distances.first(@k).map { |neighbor| @predictors[neighbor[:index]] }
+        neighboring_targets = distances.first(@k).map { |neighbor| @targets[neighbor[:index]] }
+
+        return neighboring_predictors, neighboring_targets
+      end
+
+      private
+
+      def euclidian_distance(vector1, vector2)
+        Math.sqrt((vector1 - vector2).map { |v| (v.abs)**2 }.inject(:+))
+      end
+
+    end
+  end
+end
````
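A short usage sketch of `nearest_neighbors` on toy data (assuming the gem is loaded with `require 'lurn'`): with `k = 2` it returns the two stored predictors closest to the query point together with their targets, sorted nearest first.

```ruby
require 'lurn'

model = Lurn::Neighbors::KNNBase.new(2)
model.fit([[0, 0], [1, 1], [10, 10]], ['a', 'b', 'c'])

# [0, 0] and [1, 1] are nearest to the query, in that order.
neighbors, targets = model.nearest_neighbors([0.4, 0.4])
targets # => ['a', 'b']
```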
data/lib/lurn/neighbors/knn_classifier.rb ADDED

````diff
@@ -0,0 +1,23 @@
+module Lurn
+  module Neighbors
+    class KNNClassifier < KNNBase
+
+      # Predicts the class of the given observation by selecting the most common class of the
+      # closest k training observations based on euclidian distance. In the case of a tie one winner
+      # will be chosen at random from the most frequent classes.
+      #
+      # @param vector [Array-like]
+      #   An array (or array-like) of the same length as the predictors used
+      #   to fit the model
+      # @return [Object] The predicted class
+      def predict(vector)
+        _, neighboring_targets = nearest_neighbors(vector)
+
+        class_frequencies = neighboring_targets.inject(Hash.new(0)) { |h,v| h[v] += 1; h }
+
+        neighboring_targets.max_by { |v| class_frequencies[v] }
+      end
+
+    end
+  end
+end
````
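One note on the tie-breaking comment above: reading the code, `Enumerable#max_by` returns the first element that attains the maximum, and the neighbors arrive sorted by distance, so a tie in practice resolves in favor of the nearer neighbor's class rather than a random pick. A minimal illustration:

```ruby
# Two neighbors, one of each class: the frequencies tie at 1, and max_by
# keeps the first (nearest) neighbor's class.
neighboring_targets = ['engineer', 'scientist']
class_frequencies = { 'engineer' => 1, 'scientist' => 1 }

neighboring_targets.max_by { |v| class_frequencies[v] } # => 'engineer'
```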
data/lib/lurn/neighbors/knn_regression.rb ADDED

````diff
@@ -0,0 +1,20 @@
+module Lurn
+  module Neighbors
+    class KNNRegression < KNNBase
+
+      # Predicts the value of the given observation by averaging the target value of the
+      # closest k predictor observations based on euclidian distance.
+      #
+      # @param vector [Array-like]
+      #   An array (or array-like) of the same length as the predictors used
+      #   to fit the model
+      # @return [Float] The predicted value
+      def predict(vector)
+        _, neighboring_targets = nearest_neighbors(vector)
+
+        neighboring_targets.inject(:+).to_f / neighboring_targets.length.to_f
+      end
+
+    end
+  end
+end
````
data/lib/lurn/text/bernoulli_vectorizer.rb CHANGED

````diff
@@ -17,7 +17,7 @@ module Lurn
     def fit(documents)
       @vocabulary = []
       tokenized_docs = tokenize_documents(documents)
-      @vocabulary = tokenized_docs.flatten.uniq.sort
+      @vocabulary = tokenized_docs.flatten(1).uniq.sort
       reduce_features(tokenized_docs)
     end
 
@@ -49,12 +49,9 @@ module Lurn
         end
       end
 
-      reduced_features =
-      @vocabulary.each_with_index do |token, index|
+      reduced_features = @vocabulary.select.with_index do |token, index|
         freq = doc_frequencies[index]
-
-        reduced_features.push token
-      end
+        @options[:min_df] < freq && freq < @options[:max_df]
       end
 
       @vocabulary = reduced_features
````
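The switch from `flatten` to `flatten(1)` matters once the tokenizer can emit ngrams, because each ngram token is itself an array: a full flatten would dissolve the ngrams into single words, while `flatten(1)` only unnests one level and keeps them intact as vocabulary entries. For example:

```ruby
# Two documents tokenized with ngrams: 2; each token is a word pair.
tokenized_docs = [[['hello', 'world']], [['hello', 'fred']]]

tokenized_docs.flatten.uniq.sort
# => ["fred", "hello", "world"]               (ngrams destroyed)

tokenized_docs.flatten(1).uniq.sort
# => [["hello", "fred"], ["hello", "world"]]  (ngrams preserved)
```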
data/lib/lurn/text/word_count_vectorizer.rb ADDED

````diff
@@ -0,0 +1,65 @@
+module Lurn
+  module Text
+    class WordCountVectorizer
+
+      attr_accessor :tokenizer
+      attr_accessor :vocabulary
+
+      def initialize(options = {})
+        @tokenizer = options[:tokenizer] || WordTokenizer.new
+        @vocabulary = []
+
+        options[:max_df] ||= 50
+        options[:min_df] ||= 0
+        @options = options
+      end
+
+      def fit(documents)
+        @vocabulary = []
+        tokenized_docs = tokenize_documents(documents)
+        @vocabulary = tokenized_docs.flatten(1).uniq.sort
+        reduce_features(tokenized_docs)
+      end
+
+      def to_h
+        {
+          tokenizer_options: @tokenizer.to_h,
+          vocabulary: @vocabulary
+        }
+      end
+
+      def transform(documents)
+        documents.map do |document|
+          tokens = @tokenizer.tokenize(document)
+          @vocabulary.map do |word|
+            tokens.count word
+          end
+        end
+      end
+
+      private
+
+      def reduce_features(tokenized_docs)
+        doc_frequencies = Array.new(@vocabulary.length, 0)
+
+        tokenized_docs.each do |tokens|
+          tokens.each do |token|
+            vocab_index = @vocabulary.index(token)
+            doc_frequencies[vocab_index] += 1
+          end
+        end
+
+        reduced_features = @vocabulary.select.with_index do |token, index|
+          freq = doc_frequencies[index]
+          @options[:min_df] < freq && freq < @options[:max_df]
+        end
+
+        @vocabulary = reduced_features
+      end
+
+      def tokenize_documents(documents)
+        documents.map { |doc| @tokenizer.tokenize(doc).uniq }
+      end
+    end
+  end
+end
````
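Unlike the Bernoulli vectorizer, `WordCountVectorizer#transform` produces integer counts rather than booleans. A quick sketch of what that should look like under the default options (the output is written out by hand, so treat it as illustrative):

```ruby
require 'lurn'

docs = ['ruby ruby is great', 'java is compiled']

vectorizer = Lurn::Text::WordCountVectorizer.new
vectorizer.fit(docs)

# The fitted vocabulary is sorted: ['compiled', 'great', 'is', 'java', 'ruby'],
# so 'ruby' appearing twice in the first document yields a 2 in that column.
vectorizer.transform(docs)
# => [[0, 1, 1, 0, 2], [1, 0, 1, 1, 0]]
```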
data/lib/lurn/text/word_tokenizer.rb CHANGED

````diff
@@ -21,17 +21,22 @@ module Lurn
       @options[:strip_punctuation] ||= false
       @options[:strip_stopwords] ||= false
       @options[:stem_words] ||= false
+      @options[:ngrams] ||= 1
     end
 
     def tokenize(document)
-      document = document.gsub(/[[:punct:]]/,
-      document = document.
+      document = document.gsub(/[[:punct:]]/, "") if @options[:strip_punctuation] == true
+      document = document.split("\s")
 
       if(@options[:stem_words])
         stemmer = Lingua::Stemmer.new(language: :en)
         document = document.map { |word| stemmer.stem(word) }
       end
 
+      if(@options[:ngrams] > 1)
+        document = document.each_cons(@options[:ngrams]).to_a
+      end
+
       document
     end
````
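With the new `ngrams` option, tokens become arrays of consecutive words instead of single strings. A small sketch of the expected behavior, assuming `WordTokenizer.new` accepts an options hash as the `@options` defaults above imply:

```ruby
require 'lurn'

tokenizer = Lurn::Text::WordTokenizer.new(ngrams: 2)

# each_cons(2) slides a window of two words across the token stream.
tokenizer.tokenize('the giants won')
# => [["the", "giants"], ["giants", "won"]]
```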
data/lurn.gemspec CHANGED

````diff
@@ -1,11 +1,7 @@
 # coding: utf-8
-lib = File.expand_path('../lib', __FILE__)
-$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require 'lurn/version'
-
 Gem::Specification.new do |spec|
   spec.name          = "lurn"
-  spec.version       =
+  spec.version       = "0.1.2"
   spec.authors       = ["daniel.carpenter"]
   spec.email         = ["daniel.carpenter01@gmail.com"]
 
@@ -21,13 +17,15 @@ Gem::Specification.new do |spec|
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
 
-  spec.add_dependency "terminal-table", "~> 1.
+  spec.add_dependency "terminal-table", "~> 1.8.0", '>= 1.8.0'
   spec.add_dependency "ruby-stemmer", "~> 0.9.6"
-  spec.add_dependency "daru",
+  spec.add_dependency "daru", "~> 0.2.1"
 
   spec.add_development_dependency "bundler", "~> 1.13"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec", "~> 3.0"
-  spec.add_development_dependency "awesome_print"
-  spec.add_development_dependency "byebug"
+  spec.add_development_dependency "awesome_print", "~> 0"
+  spec.add_development_dependency "byebug", "~> 10.0", ">= 10.0.2"
+  spec.add_development_dependency "rspec_junit_formatter", "~> 0.4", ">= 0.4.1"
+  spec.add_development_dependency "yard", "~> 0.9.9"
 end
````
data/readmes/evaluation/classifier_evaluator.md ADDED

````diff
@@ -0,0 +1,21 @@
+# Classifier Evaluator
+`Lurn::Evaluation::ClassifierEvaluator` provides some basic functionality for evaluating the performance of a classifier.
+
+## Example
+```
+actual_class = ['sports','science','science','sports']
+predicted_class = ['sports','sports','science','sports']
+
+eval = Lurn::Evaluation::ClassifierEvaluator.new predicted_class, actual_class
+
+print eval.summary
+
+# output
++-----------------+--------------------+--------+
+| Class           | Precision          | Recall |
++-----------------+--------------------+--------+
+| sports          | 0.6666666666666666 | 1.0    |
+| science         | 1.0                | 0.5    |
+| Overall Average | 0.8333333333333333 | 0.75   |
++-----------------+--------------------+--------+
+```
````
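The numbers in that summary follow directly from the definitions of precision and recall. Working through the 'sports' row by hand:

```ruby
# 'sports' was predicted 3 times and was correct twice: precision = 2/3.
# Both actual 'sports' examples were recovered: recall = 2/2.
precision = 2.0 / 3 # => 0.6666666666666666
recall    = 2.0 / 2 # => 1.0
```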
data/readmes/naive_bayes/bernoulli_naive_bayes.md ADDED

````diff
@@ -0,0 +1,41 @@
+### Bernoulli Naive Bayes
+Naive Bayes is a Bayesian model often used for text classification. Bernoulli Naive Bayes specifically classifies observations based on the presence or absence of a feature in an observation.
+
+Below is a simple text classification using Naive Bayes in Lurn.
+
+1. Start with some text documents
+
+```ruby
+documents = [
+  'ruby is a great programming language',
+  'the giants recently won the world series',
+  'java is a compiled programming language',
+  'the jets are a football team'
+]
+
+labels = ['computers','sports','computers','sports']
+```
+
+2. Convert them to arrays of booleans representing which words they contain (or don't contain). Lurn provides vectorizers for this purpose.
+```
+vectorizer = Lurn::Text::BernoulliVectorizer.new
+vectorizer.fit(documents)
+vectors = vectorizer.transform(documents)
+```
+
+3. Initialize and train the model
+```
+model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
+model.fit(vectors, labels)
+```
+
+4. Classify a new document
+```
+new_vectors = vectorizer.transform(['programming is fun'])
+
+# get the most probable class for the new document given the training data
+model.max_class(new_vectors.first)
+
+# get the probability score for the most probable class
+model.max_probability(new_vectors.first)
+```
````
data/readmes/naive_bayes/multinomial_naive_bayes.md ADDED

````diff
@@ -0,0 +1,41 @@
+### Multinomial Naive Bayes
+Naive Bayes is a Bayesian model often used for text classification. Multinomial Naive Bayes specifically classifies observations based on features with a multinomial distribution, such as word counts.
+
+Below is a simple text classification using Multinomial Naive Bayes in Lurn.
+
+1. Start with some text documents
+
+```ruby
+documents = [
+  'ruby is a great programming language',
+  'the giants recently won the world series',
+  'java is a compiled programming language',
+  'the jets are a football team'
+]
+
+labels = ['computers','sports','computers','sports']
+```
+
+2. Convert them to arrays of word counts. Lurn provides vectorizers for this purpose.
+```
+vectorizer = Lurn::Text::WordCountVectorizer.new
+vectorizer.fit(documents)
+vectors = vectorizer.transform(documents)
+```
+
+3. Initialize and train the model
+```
+model = Lurn::NaiveBayes::MultinomialNaiveBayes.new
+model.fit(vectors, labels)
+```
+
+4. Classify a new document
+```
+new_vectors = vectorizer.transform(['programming is fun'])
+
+# get the most probable class for the new document given the training data
+model.max_class(new_vectors.first)
+
+# get the probability score for the most probable class
+model.max_probability(new_vectors.first)
+```
````
data/readmes/neighbors/knn_classification.md ADDED

````diff
@@ -0,0 +1,48 @@
+### K Nearest Neighbor Classifier
+K Nearest Neighbor (KNN) Classification is one of the simplest forms of classification
+in the machine learning toolbox. Training data is stored on the model and all
+computation is deferred until the time of prediction. When a new observation
+is provided it calculates the distance between the new observation and all
+training data in an n-dimensional space (where n is the number of variables).
+The predicted class is the most common class among the k closest training records.
+
+Below is a simple example of using KNN Classification in Lurn.
+
+Suppose we have a dataset containing the income, years of college education and job title
+for a set of people. We could use this as training data to predict
+people's job title based on their income and years of education.
+
+```ruby
+people = [
+  # years of education   annual income   job title
+  [ 4,                   50000,          'engineer'],
+  [ 6,                   60000,          'scientist'],
+  [ 2,                   40000,          'engineer'],
+  [ 8,                   90000,          'scientist'],
+  [ 4,                   70000,          'librarian'],
+]
+
+# extract education and income
+predictors = people.map { |person| person[0..1] }
+
+# extract job title
+target_var = people.map { |person| person[2] }
+```
+
+The model can be trained by passing the predictors and target values to an initialized
+instance of the KNNClassifier model.
+
+```ruby
+# initialize the model with a k of 2
+model = Lurn::Neighbors::KNNClassifier.new(2)
+
+model.fit(predictors, target_var)
+```
+
+The model can now be used to predict the job title of a person given his/her
+years of education and income.
+
+```ruby
+# predict the job title of a person with 4 years of education who makes $45,000
+model.predict([4, 45000]) # => engineer
+```
````
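Working through the arithmetic behind that prediction (a hand calculation reusing the `people` array above): because the features are unscaled, the income column dominates the Euclidean distance, and the two nearest rows to `[4, 45000]` are both engineers, so 'engineer' wins the vote.

```ruby
query = [4, 45000]

people.map { |education, income, title|
  distance = Math.sqrt((education - query[0])**2 + (income - query[1])**2)
  [title, distance.round(4)]
}.sort_by(&:last).first(2)
# => [["engineer", 5000.0], ["engineer", 5000.0004]]
```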
data/readmes/neighbors/knn_regression.md ADDED

````diff
@@ -0,0 +1,48 @@
+### K Nearest Neighbor Regression
+K Nearest Neighbor (KNN) Regression is one of the simplest forms of regression
+in the machine learning toolbox. Training data is stored on the model and all
+computation is deferred until the time of prediction. When a new observation
+is provided it calculates the distance between the new observation and all
+training data in an n-dimensional space (where n is the number of variables).
+The predicted value is the average value of the k closest training records.
+
+Below is a simple example of using KNN Regression in Lurn.
+
+Suppose we have a dataset containing the age, years of college education and annual
+income for a set of people. We could use this as training data to predict
+people's annual income based on their age and years of education.
+
+```ruby
+people = [
+  # age   years of education   annual income
+  [ 25,   4,                   50000],
+  [ 35,   6,                   60000],
+  [ 51,   2,                   40000],
+  [ 45,   8,                   90000],
+  [ 32,   4,                   70000],
+]
+
+# extract age and education
+predictors = people.map { |person| person[0..1] }
+
+# extract annual income
+target_var = people.map { |person| person[2] }
+```
+
+The model can be trained by passing the predictors and target values to an initialized
+instance of the KNNRegression model.
+
+```ruby
+# initialize the model with a k of 2
+model = Lurn::Neighbors::KNNRegression.new(2)
+
+model.fit(predictors, target_var)
+```
+
+The model can now be used to predict the income of a person given his/her
+age and years of education.
+
+```ruby
+# predict the income of a 31 year old person with 4 years of education
+model.predict([31, 4])
+```
````
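That call can be worked out by hand: the two training rows nearest to `[31, 4]` are `[32, 4]` (income 70000, distance 1.0) and `[35, 6]` (income 60000, distance of roughly 4.47), so with `k = 2` the prediction is their average.

```ruby
# Average of the two nearest neighbors' incomes.
(70000 + 60000) / 2.0 # => 65000.0
```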
data/readmes/text_processing/bernoulli_vectorizer.md ADDED

````diff
@@ -0,0 +1,30 @@
+# Bernoulli Vectorizer (word presence vectorizer)
+
+A Bernoulli document model is one that represents a piece of text as an array of boolean values. Each boolean represents the presence (true) or absence (false) of a word in the document.
+
+`Lurn::Text::BernoulliVectorizer` is intended to make it easy to convert text into Bernoulli vectors.
+
+## Basic example
+```
+docs = ['hello world', 'hello fred']
+
+vectorizer = Lurn::Text::BernoulliVectorizer.new
+
+# vectorizers must be trained in order to know
+# what features (words) exist in the data set
+vectorizer.fit(docs)
+
+vectorizer.transform(docs)
+```
+
+## Configuration
+`BernoulliVectorizer.new` accepts a number of options for configuring how documents are vectorized. A few include:
+- max_df [int]: Excludes words which appear in more than `max_df` documents
+- min_df [int]: Excludes words which appear in fewer than `min_df` documents
+- strip_stopwords [boolean]: Removes stop words if true
+- stem_words [boolean]: Stems words in the documents if true
+- ngrams [int]: Features will be determined based on groupings of `ngrams` consecutive words instead of individual words
+
+```
+Lurn::Text::BernoulliVectorizer.new(strip_stopwords: true, min_df: 10)
+```
````
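One caveat when tuning `min_df` and `max_df`: `reduce_features` uses strict comparisons (`min_df < freq && freq < max_df`), so a word must appear in strictly more than `min_df` documents to survive. A sketch of the effect, assuming the vectorizer exposes its `vocabulary` the way `WordCountVectorizer` does:

```ruby
docs = ['hello world', 'hello fred']

# 'hello' has document frequency 2; 'world' and 'fred' have frequency 1,
# so with min_df: 1 only 'hello' clears the strict threshold.
vectorizer = Lurn::Text::BernoulliVectorizer.new(min_df: 1)
vectorizer.fit(docs)

vectorizer.vocabulary # => ['hello']
```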
metadata CHANGED

````diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: lurn
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - daniel.carpenter
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2018-08-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: terminal-table
@@ -16,14 +16,20 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.8.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.8.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.0
 - !ruby/object:Gem::Dependency
   name: ruby-stemmer
   requirement: !ruby/object:Gem::Requirement
@@ -44,14 +50,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1
+        version: 0.2.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
    - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1
+        version: 0.2.1
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -98,30 +104,70 @@ dependencies:
   name: awesome_print
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: byebug
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
     - - ">="
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 10.0.2
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
     - - ">="
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 10.0.2
+- !ruby/object:Gem::Dependency
+  name: rspec_junit_formatter
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.1
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.9
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.9
 description: " A gem with tools for machine learning. "
 email:
 - daniel.carpenter01@gmail.com
@@ -129,6 +175,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".rspec"
 - ".travis.yml"
@@ -140,11 +187,22 @@ files:
 - bin/setup
 - lib/lurn.rb
 - lib/lurn/evaluation/classifier_evaluator.rb
+- lib/lurn/naive_bayes/base.rb
 - lib/lurn/naive_bayes/bernoulli_naive_bayes.rb
+- lib/lurn/naive_bayes/multinomial_naive_bayes.rb
+- lib/lurn/neighbors/knn_base.rb
+- lib/lurn/neighbors/knn_classifier.rb
+- lib/lurn/neighbors/knn_regression.rb
 - lib/lurn/text/bernoulli_vectorizer.rb
+- lib/lurn/text/word_count_vectorizer.rb
 - lib/lurn/text/word_tokenizer.rb
-- lib/lurn/version.rb
 - lurn.gemspec
+- readmes/evaluation/classifier_evaluator.md
+- readmes/naive_bayes/bernoulli_naive_bayes.md
+- readmes/naive_bayes/multinomial_naive_bayes.md
+- readmes/neighbors/knn_classification.md
+- readmes/neighbors/knn_regression.md
+- readmes/text_processing/bernoulli_vectorizer.md
 homepage: https://www.github.com/dansbits/lurn
 licenses:
 - MIT
````
data/lib/lurn/version.rb DELETED