lurn 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +46 -0
- data/README.md +11 -32
- data/lib/lurn.rb +6 -2
- data/lib/lurn/naive_bayes/base.rb +32 -0
- data/lib/lurn/naive_bayes/bernoulli_naive_bayes.rb +17 -44
- data/lib/lurn/naive_bayes/multinomial_naive_bayes.rb +65 -0
- data/lib/lurn/neighbors/knn_base.rb +54 -0
- data/lib/lurn/neighbors/knn_classifier.rb +23 -0
- data/lib/lurn/neighbors/knn_regression.rb +20 -0
- data/lib/lurn/text/bernoulli_vectorizer.rb +3 -6
- data/lib/lurn/text/word_count_vectorizer.rb +65 -0
- data/lib/lurn/text/word_tokenizer.rb +7 -2
- data/lurn.gemspec +7 -9
- data/readmes/evaluation/classifier_evaluator.md +21 -0
- data/readmes/naive_bayes/bernoulli_naive_bayes.md +41 -0
- data/readmes/naive_bayes/multinomial_naive_bayes.md +41 -0
- data/readmes/neighbors/knn_classification.md +48 -0
- data/readmes/neighbors/knn_regression.md +48 -0
- data/readmes/text_processing/bernoulli_vectorizer.md +30 -0
- metadata +69 -11
- data/lib/lurn/version.rb +0 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 74be6a1bd3e76e61d34048367f8fafed76c39a46
+  data.tar.gz: 355ea667da4dd95845d00d8ebebc40636d740cab
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9dad1d2540818efd226bb029aca818b19f5995a3bfbe77392e2577236038f7c84237cb4b19203e0101da1830338d135ea321e27cbecd32664851994c5a100035
+  data.tar.gz: 4defd5fc70dcfbd3389ab184cf59764cf734bc8bda9347070e0c448cb7286ca7d1cd17ca926785191501ca030a4f8f3c74639fa4271a3f6d13cabe88d2a012d0

data/.circleci/config.yml
ADDED
@@ -0,0 +1,46 @@
+# Ruby CircleCI 2.0 configuration file
+#
+# Check https://circleci.com/docs/2.0/language-ruby/ for more details
+#
+version: 2
+jobs:
+  build:
+    docker:
+      # specify the version you desire here
+      - image: circleci/ruby:2.4
+
+      # Specify service dependencies here if necessary
+      # CircleCI maintains a library of pre-built images
+      # documented at https://circleci.com/docs/2.0/circleci-images/
+      # - image: circleci/postgres:9.4
+
+    working_directory: ~/repo
+
+    steps:
+      - checkout
+
+      - run:
+          name: install dependencies
+          command: |
+            bundle install --jobs=4 --retry=3 --path vendor/bundle
+
+      # run tests!
+      - run:
+          name: run tests
+          command: |
+            mkdir /tmp/test-results
+            TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb" | circleci tests split --split-by=timings)"
+
+            bundle exec rspec --format progress \
+                              --format RspecJunitFormatter \
+                              --out /tmp/test-results/rspec.xml \
+                              --format progress \
+                              -- \
+                              $TEST_FILES
+
+      # collect reports
+      - store_test_results:
+          path: /tmp/test-results
+      - store_artifacts:
+          path: /tmp/test-results
+          destination: test-results

data/README.md
CHANGED
@@ -1,6 +1,6 @@
 # Lurn
 
-Lurn is a ruby gem for performing machine learning. The API and design patterns in Lurn are inspired by
+Lurn is a ruby gem for performing machine learning tasks. The API and design patterns in Lurn are inspired by scikit-learn, a popular machine learning library for Python.
 
 ## Installation
 
@@ -20,37 +20,16 @@ Or install it yourself as:
 
 ## Usage
 
-
-
-
-
-
-
-
-
-
-]
-
-labels = ['computers','sports','computers','sports']
-
-# vectorizers take raw data and transform it to a set of features that our
-# model can understand - in this case an array of boolean values representing
-# the presence or absence of a word in text
-vectorizer = Lurn::Text::BernoulliVectorizer.new
-vectorizer.fit(documents)
-vectors = vectorizer.transform(documents)
-
-model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
-model.fit(vectors, labels)
-
-new_vectors = vectorizer.transform(['programming is fun'])
-probabilities = model.predict_probabilities(new_vectors.first)
-# => [0.9715681919147049, 0.028431808085295614]
-
-# to get the class of the maximum probability, look at the same index of the
-# unique_labels attribute on the model
-model.unique_labels[0] # => 'computers'
-```
+- Naive Bayes
+  - [Bernoulli Naive Bayes](readmes/naive_bayes/bernoulli_naive_bayes.md)
+  - [Multinomial Naive Bayes](readmes/naive_bayes/multinomial_naive_bayes.md)
+- Nearest Neighbor Models
+  - [K Nearest Neighbor Regression](readmes/neighbors/knn_regression.md)
+  - [K Nearest Neighbor Classification](readmes/neighbors/knn_classification.md)
+- Text Processing
+  - [Bernoulli Vectorizer](readmes/text_processing/bernoulli_vectorizer.md)
+- Model Evaluation
+  - [ClassifierEvaluator](readmes/evaluation/classifier_evaluator.md)
 
 ## Development
 

data/lib/lurn.rb
CHANGED
@@ -1,10 +1,14 @@
 require "daru"
-require "lurn/version"
 require "lurn/text/word_tokenizer"
 require "lurn/text/bernoulli_vectorizer"
+require "lurn/text/word_count_vectorizer"
+require "lurn/naive_bayes/base"
 require "lurn/naive_bayes/bernoulli_naive_bayes"
+require "lurn/naive_bayes/multinomial_naive_bayes"
 require "lurn/evaluation/classifier_evaluator"
+require "lurn/neighbors/knn_base"
+require "lurn/neighbors/knn_regression"
+require "lurn/neighbors/knn_classifier"
 
 module Lurn
-  # Your code goes here...
 end

data/lib/lurn/naive_bayes/base.rb
ADDED
@@ -0,0 +1,32 @@
+module Lurn
+  module NaiveBayes
+    class Base
+      def predict_probabilities(vector)
+        log_probabilties = predict_log_probabilities(vector)
+
+        log_probabilties.map { |p| Math.exp(p) }
+      end
+
+      def max_class(vector)
+        log_probs = predict_log_probabilities(vector)
+
+        max_index = log_probs.index(log_probs.max)
+
+        unique_labels[max_index]
+      end
+
+      def max_probability(vector)
+        probs = predict_probabilities(vector)
+
+        probs.max
+      end
+
+      def predict_log_probabilities(vector)
+        vector = Vector.elements(vector)
+        jll = joint_log_likelihood(vector)
+        log_prob_x = Math.log(jll.map { |v| Math.exp(v) }.inject(:+))
+        jll.map{ |v| v - log_prob_x }
+      end
+    end
+  end
+end

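The new `Base` class factors the shared prediction math out of the individual classifiers: `predict_log_probabilities` normalizes whatever `joint_log_likelihood` returns, and the other methods derive from it, so a subclass only needs to supply `joint_log_likelihood` and `unique_labels`. A minimal sketch of that contract (the toy subclass and its fixed likelihoods below are invented for illustration and are not part of this release):

```ruby
require 'matrix'
require 'lurn'

# Hypothetical subclass: returns fixed joint log likelihoods
# instead of learning them from data.
class ToyModel < Lurn::NaiveBayes::Base
  def unique_labels
    ['computers', 'sports']
  end

  def joint_log_likelihood(_vector)
    # unnormalized log p(x, label) for each label
    [Math.log(0.03), Math.log(0.01)]
  end
end

model = ToyModel.new
model.predict_probabilities([true, false]) # => [0.75, 0.25]
model.max_class([true, false])             # => "computers"
```
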
data/lib/lurn/naive_bayes/bernoulli_naive_bayes.rb
CHANGED
@@ -2,7 +2,7 @@ require 'matrix'
 
 module Lurn
   module NaiveBayes
-    class BernoulliNaiveBayes
+    class BernoulliNaiveBayes < Base
 
       attr_accessor :probability_matrix, :label_probabilities, :unique_labels
 
@@ -19,38 +19,7 @@ module Lurn
       document_count_matrix = build_document_count_matrix(vectors, labels)
       @probability_matrix = build_probability_matrix(document_count_matrix, labels)
 
-      @label_probabilities = @unique_labels.map { |l1| labels.
-    end
-
-    def predict_probabilities(vector)
-      log_probabilties = predict_log_probabilities(vector)
-
-      log_probabilties.map { |p| Math.exp(p) }
-    end
-
-    def predict_log_probabilities(vector)
-
-      probabilities = @unique_labels.map do |label|
-        joint_log_likelihood(vector, label)
-      end
-
-      log_prob_x = Math.log(probabilities.map { |v| Math.exp(v) }.sum)
-
-      probabilities.map { |p| p - log_prob_x }
-    end
-
-    def max_class(vector)
-      log_probs = predict_log_probabilities(vector)
-
-      max_index = log_probs.index(log_probs.max)
-
-      unique_labels[max_index]
-    end
-
-    def max_probability(vector)
-      probs = predict_probabilities(vector)
-
-      probs.max
+      @label_probabilities = @unique_labels.map { |l1| labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f }
     end
 
     def to_h
@@ -64,11 +33,11 @@ module Lurn
     private
 
     def build_probability_matrix(document_count_matrix, labels)
-      probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count
+      probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
 
       document_count_matrix.each_with_index do |value, row, col|
        label = @unique_labels[row]
-        label_frequency = labels.
+        label_frequency = labels.count(label)
 
        probability_matrix[row][col] = Math.log((value.to_f + @k) / (label_frequency.to_f + (2.0 * @k)))
      end
@@ -77,7 +46,7 @@ module Lurn
     end
 
     def build_document_count_matrix(vectors, labels)
-      matrix = Array.new(@unique_labels.count) { Array.new(@feature_count
+      matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
 
       vectors.each_with_index do |value, row, col|
        if value == true
@@ -90,16 +59,20 @@ module Lurn
       Matrix.rows(matrix)
     end
 
-    def joint_log_likelihood(
-
+    def joint_log_likelihood(x)
+      jlls = []
 
-
-
-
-
-
+      unique_labels.each_with_index do |label, label_index|
+        vector = Vector.elements(x.map { |e| e == true ? 1 : 0 })
+        probabilities = @probability_matrix.row(label_index)
+        neg_probs = probabilities.map { |prb| Math.log(1.0 - Math.exp(prb)) }
+        jll = vector.dot(probabilities - neg_probs)
+        jll += Math.log(@label_probabilities[label_index]) + neg_probs.inject(:+)
+
+        jlls.push jll
+      end
 
-
+      jlls
     end
 
   end

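`build_probability_matrix` stores smoothed log conditionals: with smoothing constant `@k`, a word seen in `value` of a label's `label_frequency` documents gets `log((value + k) / (label_frequency + 2k))`, so an unseen word contributes a small finite log probability instead of negative infinity. A quick standalone check of that arithmetic (illustrative numbers, not taken from the gem):

```ruby
k = 1.0                # smoothing constant (@k in the class; value assumed here)
value = 2.0            # documents of this label containing the word
label_frequency = 3.0  # total documents with this label

Math.log((value + k) / (label_frequency + (2.0 * k)))
# => log(3/5) ≈ -0.51; with value = 0 it is log(1/5), never -Infinity
```
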
data/lib/lurn/naive_bayes/multinomial_naive_bayes.rb
ADDED
@@ -0,0 +1,65 @@
+module Lurn
+  module NaiveBayes
+    class MultinomialNaiveBayes < Base
+
+      attr_accessor :prior_probabilities, :probability_matrix, :unique_labels
+
+      def initialize
+
+      end
+
+      def fit(vectors, labels)
+        vectors = Matrix.rows(vectors)
+
+        @unique_labels = labels.uniq
+        @feature_count = vectors.column_size
+        count_matrix = build_count_matrix(vectors, labels)
+        @probability_matrix = build_probability_matrix(count_matrix, labels)
+        @prior_probabilities = @unique_labels.map do |l1|
+          labels.count { |l2| l1 == l2 }.to_f / labels.count.to_f
+        end
+      end
+
+      private
+
+      def build_probability_matrix(count_matrix, labels)
+        probability_matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0.0) }
+
+        count_matrix.each_with_index do |value, row, col|
+          label = @unique_labels[row]
+          label_frequency = labels.count(label)
+
+          numerator = (value.to_f + 1.0)
+          denominator = count_matrix.row(row).inject(:+) + @feature_count
+          probability_matrix[row][col] = Math.log(numerator / denominator)
+        end
+
+        probability_matrix
+      end
+
+      def build_count_matrix(vectors, labels)
+        matrix = Array.new(@unique_labels.count) { Array.new(@feature_count, 0) }
+
+        vectors.each_with_index do |value, row, col|
+          label = labels[row]
+          label_index = @unique_labels.index(label)
+          matrix[label_index][col] += value
+        end
+
+        Matrix.rows(matrix)
+      end
+
+      def joint_log_likelihood(vector)
+        jlls = []
+        @unique_labels.each_with_index do |label, label_index|
+          probabilities = @probability_matrix[label_index]
+          jll = vector.dot(probabilities)
+          jll += Math.log(@prior_probabilities[label_index])
+          jlls.push(jll)
+        end
+
+        jlls
+      end
+    end
+  end
+end

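`MultinomialNaiveBayes` applies add-one smoothing over word counts: each conditional is `(count + 1) / (label_total + vocabulary_size)`, and `joint_log_likelihood` adds the log prior. A self-contained fitting-and-scoring sketch with toy count vectors (data invented for illustration; assumes the classes from this release are loaded via `require 'lurn'`):

```ruby
require 'lurn'

# Toy word-count vectors over a 3-word vocabulary (invented data)
vectors = [
  [2, 0, 1], # label 'computers'
  [0, 3, 0], # label 'sports'
]
labels = ['computers', 'sports']

model = Lurn::NaiveBayes::MultinomialNaiveBayes.new
model.fit(vectors, labels)

# p(word|computers) = (2+1)/(3+3), (0+1)/(3+3), (1+1)/(3+3)
model.max_class([1, 0, 1])       # => "computers"
model.max_probability([1, 0, 1]) # ≈ 0.86
```
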
data/lib/lurn/neighbors/knn_base.rb
ADDED
@@ -0,0 +1,54 @@
+module Lurn
+  module Neighbors
+    class KNNBase
+
+      attr_accessor :predictors, :targets, :k
+
+      def initialize(k)
+        @k = k
+      end
+
+      # Trains the KNN regression model to predict the target variable
+      # based on the predictors. For KNN Regression all computation is
+      # deferred until the time of prediction so in this case the data
+      # is just stored.
+      #
+      # @param predictors [Array-like] An array of arrays containing the predictor data
+      # @param targets [Array-like] An array with the value you want to predict
+      def fit(predictors, targets)
+        @predictors = predictors.map { |pred| Vector.elements(pred) }
+        @targets = targets
+
+        nil
+      end
+
+      # Returns the predictors and target value for the k nearest neighbors for the vector parameter
+      #
+      # @param vector [Array-like] An array of the same length and type as the predictors used to train the model
+      # @return [Array, Array]
+      #   Returns two values. The first is an array of the predictors for the k nearest neighbors. The second is an
+      #   array of the corresponding target values for the k nearest neighbors.
+      def nearest_neighbors(vector)
+        vector = Vector.elements(vector)
+
+        distances = @predictors.map.with_index do |p, index|
+          { index: index, distance: euclidian_distance(p, vector), value: targets[index] }
+        end
+
+        distances.sort! { |x,y| x[:distance] <=> y[:distance] }
+
+        neighboring_predictors = distances.first(@k).map { |neighbor| @predictors[neighbor[:index]] }
+        neighboring_targets = distances.first(@k).map { |neighbor| @targets[neighbor[:index]] }
+
+        return neighboring_predictors, neighboring_targets
+      end
+
+      private
+
+      def euclidian_distance(vector1, vector2)
+        Math.sqrt((vector1 - vector2).map { |v| (v.abs)**2 }.inject(:+))
+      end
+
+    end
+  end
+end

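`nearest_neighbors` is the core both KNN models share: compute the Euclidean distance from the query to every stored predictor, sort, and return the k closest predictors and their targets. A small sketch of calling it directly (toy data, illustration only; assumes `require 'lurn'` loads this release):

```ruby
require 'lurn'

knn = Lurn::Neighbors::KNNBase.new(2)
knn.fit([[0, 0], [1, 1], [10, 10]], ['a', 'b', 'c'])

predictors, targets = knn.nearest_neighbors([0.4, 0.9])
targets # => ["b", "a"] -- the two closest training rows, nearest first
```
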
data/lib/lurn/neighbors/knn_classifier.rb
ADDED
@@ -0,0 +1,23 @@
+module Lurn
+  module Neighbors
+    class KNNClassifier < KNNBase
+
+      # Predicts the class of the given observation by selecting the most common class of the
+      # closest k training observations based on euclidian distance. In the case of a tie one winner
+      # will be chosen at random from the most frequent classes.
+      #
+      # @param vector [Array-like]
+      #   An array (or array-like) of the same length as the predictors used
+      #   to fit the model
+      # @return [Object] The predicted class
+      def predict(vector)
+        _, neighboring_targets = nearest_neighbors(vector)
+
+        class_frequencies = neighboring_targets.inject(Hash.new(0)) { |h,v| h[v] += 1; h }
+
+        neighboring_targets.max_by { |v| class_frequencies[v] }
+      end
+
+    end
+  end
+end

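One nuance worth noting: the comment says ties are broken at random, but `max_by` in MRI keeps the first element that attains the maximum, so among tied classes the one whose member appears earliest in distance order (the closer neighbor) wins deterministically. A toy prediction (invented data, illustration only):

```ruby
require 'lurn'

model = Lurn::Neighbors::KNNClassifier.new(3)
model.fit([[1], [2], [3], [10]], ['x', 'x', 'y', 'y'])

# 3 nearest to 2.6 are 3 ('y'), 2 ('x'), 1 ('x') -> 'x' wins 2 votes to 1
model.predict([2.6]) # => "x"
```
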
data/lib/lurn/neighbors/knn_regression.rb
ADDED
@@ -0,0 +1,20 @@
+module Lurn
+  module Neighbors
+    class KNNRegression < KNNBase
+
+      # Predicts the value of the given observation by averaging the target value of the
+      # closest k predictor observations based on euclidian distance.
+      #
+      # @param vector [Array-like]
+      #   An array (or array-like) of the same length as the predictors used
+      #   to fit the model
+      # @return [Float] The predicted value
+      def predict(vector)
+        _, neighboring_targets = nearest_neighbors(vector)
+
+        neighboring_targets.inject(:+).to_f / neighboring_targets.length.to_f
+      end
+
+    end
+  end
+end

data/lib/lurn/text/bernoulli_vectorizer.rb
CHANGED
@@ -17,7 +17,7 @@ module Lurn
     def fit(documents)
       @vocabulary = []
       tokenized_docs = tokenize_documents(documents)
-      @vocabulary = tokenized_docs.flatten.uniq.sort
+      @vocabulary = tokenized_docs.flatten(1).uniq.sort
       reduce_features(tokenized_docs)
     end
 
@@ -49,12 +49,9 @@ module Lurn
         end
       end
 
-      reduced_features =
-      @vocabulary.each_with_index do |token, index|
+      reduced_features = @vocabulary.select.with_index do |token, index|
         freq = doc_frequencies[index]
-
-        reduced_features.push token
-      end
+        @options[:min_df] < freq && freq < @options[:max_df]
       end
 
       @vocabulary = reduced_features

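Two behavior changes here: `flatten(1)` keeps ngram tokens (arrays of words) intact as single vocabulary entries instead of dissolving them into individual words, and the rewritten `reduce_features` actually applies the `min_df`/`max_df` bounds, where the removed loop computed `freq` but, as far as the visible lines show, pushed every token. The filtering logic in isolation (standalone sketch with illustrative values):

```ruby
vocabulary      = ['a', 'b', 'c', 'd']
doc_frequencies = [1, 5, 60, 12]      # documents containing each word
options = { min_df: 1, max_df: 50 }   # illustrative bounds

vocabulary.select.with_index do |_token, index|
  freq = doc_frequencies[index]
  options[:min_df] < freq && freq < options[:max_df]
end
# => ["b", "d"] -- 'a' fails min_df (not strictly greater), 'c' fails max_df
```
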
data/lib/lurn/text/word_count_vectorizer.rb
ADDED
@@ -0,0 +1,65 @@
+module Lurn
+  module Text
+    class WordCountVectorizer
+
+      attr_accessor :tokenizer
+      attr_accessor :vocabulary
+
+      def initialize(options = {})
+        @tokenizer = options[:tokenizer] || WordTokenizer.new
+        @vocabulary = []
+
+        options[:max_df] ||= 50
+        options[:min_df] ||= 0
+        @options = options
+      end
+
+      def fit(documents)
+        @vocabulary = []
+        tokenized_docs = tokenize_documents(documents)
+        @vocabulary = tokenized_docs.flatten(1).uniq.sort
+        reduce_features(tokenized_docs)
+      end
+
+      def to_h
+        {
+          tokenizer_options: @tokenizer.to_h,
+          vocabulary: @vocabulary
+        }
+      end
+
+      def transform(documents)
+        documents.map do |document|
+          tokens = @tokenizer.tokenize(document)
+          @vocabulary.map do |word|
+            tokens.count word
+          end
+        end
+      end
+
+      private
+
+      def reduce_features(tokenized_docs)
+        doc_frequencies = Array.new(@vocabulary.length, 0)
+
+        tokenized_docs.each do |tokens|
+          tokens.each do |token|
+            vocab_index = @vocabulary.index(token)
+            doc_frequencies[vocab_index] += 1
+          end
+        end
+
+        reduced_features = @vocabulary.select.with_index do |token, index|
+          freq = doc_frequencies[index]
+          @options[:min_df] < freq && freq < @options[:max_df]
+        end
+
+        @vocabulary = reduced_features
+      end
+
+      def tokenize_documents(documents)
+        documents.map { |doc| @tokenizer.tokenize(doc).uniq }
+      end
+    end
+  end
+end

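A detail worth knowing when reading this class: `tokenize_documents` uniques each document's tokens, so `reduce_features` filters on document frequency, while `transform` re-tokenizes without uniquing and returns raw occurrence counts. A short usage sketch (toy documents; assumes the release's default tokenizer and df bounds):

```ruby
require 'lurn'

vectorizer = Lurn::Text::WordCountVectorizer.new
vectorizer.fit(['hello hello world', 'hello fred'])

vectorizer.vocabulary
# => ["fred", "hello", "world"] (sorted; all pass the default df bounds)

vectorizer.transform(['hello hello fred'])
# => [[1, 2, 0]] -- raw counts per vocabulary word
```
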
data/lib/lurn/text/word_tokenizer.rb
CHANGED
@@ -21,17 +21,22 @@ module Lurn
     @options[:strip_punctuation] ||= false
     @options[:strip_stopwords] ||= false
     @options[:stem_words] ||= false
+    @options[:ngrams] ||= 1
   end
 
   def tokenize(document)
-    document = document.gsub(/[[:punct:]]/,
-    document = document.
+    document = document.gsub(/[[:punct:]]/, "") if @options[:strip_punctuation] == true
+    document = document.split("\s")
 
     if(@options[:stem_words])
       stemmer = Lingua::Stemmer.new(language: :en)
       document = document.map { |word| stemmer.stem(word) }
     end
 
+    if(@options[:ngrams] > 1)
+      document = document.each_cons(@options[:ngrams]).to_a
+    end
+
     document
   end
 

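The new `ngrams` option turns the token list into overlapping n-word groups via `each_cons`, which is why the vectorizers above switched to `flatten(1)`. The behavior in isolation (plain Ruby, no gem needed):

```ruby
tokens = "the quick brown fox".split("\s")
tokens.each_cons(2).to_a
# => [["the", "quick"], ["quick", "brown"], ["brown", "fox"]]
```
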
data/lurn.gemspec
CHANGED
@@ -1,11 +1,7 @@
 # coding: utf-8
-lib = File.expand_path('../lib', __FILE__)
-$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
-require 'lurn/version'
-
 Gem::Specification.new do |spec|
   spec.name          = "lurn"
-  spec.version       =
+  spec.version       = "0.1.2"
   spec.authors       = ["daniel.carpenter"]
   spec.email         = ["daniel.carpenter01@gmail.com"]
 
@@ -21,17 +17,15 @@ Gem::Specification.new do |spec|
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
 
-  spec.add_dependency "terminal-table", "~> 1.
+  spec.add_dependency "terminal-table", "~> 1.8.0", '>= 1.8.0'
   spec.add_dependency "ruby-stemmer", "~> 0.9.6"
-  spec.add_dependency "daru",
+  spec.add_dependency "daru", "~> 0.2.1"
 
   spec.add_development_dependency "bundler", "~> 1.13"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec", "~> 3.0"
-  spec.add_development_dependency "awesome_print"
-  spec.add_development_dependency "byebug"
+  spec.add_development_dependency "awesome_print", "~> 0"
+  spec.add_development_dependency "byebug", "~> 10.0", ">= 10.0.2"
+  spec.add_development_dependency "rspec_junit_formatter", "~> 0.4", ">= 0.4.1"
+  spec.add_development_dependency "yard", "~> 0.9.9"
 end

data/readmes/evaluation/classifier_evaluator.md
ADDED
@@ -0,0 +1,21 @@
+# Classifier Evaluator
+`Lurn::Evaluation::ClassifierEvaluator` provides some basic functionality for evaluating the performance of a classifier.
+
+## Example
+```
+actual_class = ['sports','science','science','sports']
+predicted_class = ['sports','sports','science','sports']
+
+eval = Lurn::Evaluation::ClassifierEvaluator.new predicted_class, actual_class
+
+print eval.summary
+
+# output
++-----------------+--------------------+--------+
+| Class           | Precision          | Recall |
++-----------------+--------------------+--------+
+| sports          | 0.6666666666666666 | 1.0    |
+| science         | 1.0                | 0.5    |
+| Overall Average | 0.8333333333333333 | 0.75   |
++-----------------+--------------------+--------+
+```

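The table's numbers check out: precision is true positives over predicted positives per class, recall is true positives over actual positives. A standalone verification in plain Ruby (no gem required):

```ruby
actual    = ['sports','science','science','sports']
predicted = ['sports','sports','science','sports']

['sports', 'science'].each do |cls|
  tp = actual.zip(predicted).count { |a, p| a == cls && p == cls }
  precision = tp.to_f / predicted.count(cls) # 2/3, then 1/1
  recall    = tp.to_f / actual.count(cls)    # 2/2, then 1/2
  puts format('%s: precision=%.4f recall=%.4f', cls, precision, recall)
end
```
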
data/readmes/naive_bayes/bernoulli_naive_bayes.md
ADDED
@@ -0,0 +1,41 @@
+### Bernoulli Naive Bayes
+Naive bayes is a bayesian model often used for text classification. Bernoulli Naive Bayes specifically classifies observations based on the presence or absence of a feature in an observation.
+
+Below is a simple text classification using Naive Bayes in Lurn.
+
+1. Start with some text documents
+
+```ruby
+documents = [
+  'ruby is a great programming language',
+  'the giants recently won the world series',
+  'java is a compiled programming language',
+  'the jets are a football team'
+]
+
+labels = ['computers','sports','computers','sports']
+```
+
+2. Convert them to arrays of booleans representing which words they contain (or don't contain). Lurn provides vectorizers for this purpose.
+```
+vectorizer = Lurn::Text::BernoulliVectorizer.new
+vectorizer.fit(documents)
+vectors = vectorizer.transform(documents)
+```
+
+3. Initialize and train the model
+```
+model = Lurn::NaiveBayes::BernoulliNaiveBayes.new
+model.fit(vectors, labels)
+```
+
+4. Classify a new document
+```
+new_vectors = vectorizer.transform(['programming is fun'])
+
+# get the most probable class for the new document given the training data
+model.max_class(new_vectors.first)
+
+# get the probability score for the most probable class
+model.max_probability(new_vectors.first)
+```

data/readmes/naive_bayes/multinomial_naive_bayes.md
ADDED
@@ -0,0 +1,41 @@
+### Multinomial Naive Bayes
+Naive bayes is a bayesian model often used for text classification. Multinomial Naive Bayes specifically classifies observations based on variables with a multinomial distribution (a.k.a. numbers).
+
+Below is a simple text classification using Multinomial Naive Bayes in Lurn.
+
+1. Start with some text documents
+
+```ruby
+documents = [
+  'ruby is a great programming language',
+  'the giants recently won the world series',
+  'java is a compiled programming language',
+  'the jets are a football team'
+]
+
+labels = ['computers','sports','computers','sports']
+```
+
+2. Convert them to arrays of booleans representing which words they contain (or don't contain). Lurn provides vectorizers for this purpose.
+```
+vectorizer = Lurn::Text::WordCountVectorizer.new
+vectorizer.fit(documents)
+vectors = vectorizer.transform(documents)
+```
+
+3. Initialize and train the model
+```
+model = Lurn::NaiveBayes::MultinomialNaiveBayes.new
+model.fit(vectors, labels)
+```
+
+4. Classify a new document
+```
+new_vectors = vectorizer.transform(['programming is fun'])
+
+# get the most probable class for the new document given the training data
+model.max_class(new_vectors.first)
+
+# get the probability score for the most probable class
+model.max_probability(new_vectors.first)
+```

data/readmes/neighbors/knn_classification.md
ADDED
@@ -0,0 +1,48 @@
+### K Nearest Neighbor Classifier
+K Nearest Neighbor (KNN) Classification is one of the simplest forms of classification
+in the machine learning toolbox. Training data is stored on the model and all
+computation is deferred until the time of prediction. When a new observation
+is provided it calculates the distance between the new observation and all
+training data in an n-dimensional space (where n is the number of variables).
+The predicted class is the most common class among the k closest training records.
+
+Below is a simple example of using KNN Classification in Lurn.
+
+Suppose we have a dataset containing the income, years of college eduction and job title
+for a set of people. We could use this as training data to predict
+people's job title based on their income and years of eduction.
+
+```ruby
+people = [
+  # years of education   annual income   job title
+  [ 4,                   50000,          'engineer'],
+  [ 6,                   60000,          'scientist'],
+  [ 2,                   40000,          'engineer'],
+  [ 8,                   90000,          'scientist'],
+  [ 4,                   70000,          'librarian'],
+]
+
+# eduction and income
+predictors = people.map { |person| person[0..1] }
+
+# extract annual income
+target_var = people.map { |person| person[2]}
+```
+
+The model can be trained by passing the predictors and target values to an initialized
+instance of the KNNClassifier model.
+
+```ruby
+# initialize the model with a k of 2
+model = Lurn::Neighbors::KNNClassifier.new(2)
+
+model.fit(predictors, target_var)
+```
+
+The model can now be used to predict the income of a person given his/her
+age and years of education.
+
+```ruby
+# predict the job title of person with 4 years of eduction who make $45,000
+model.predict([4, 45000]) # => engineer
+```

data/readmes/neighbors/knn_regression.md
ADDED
@@ -0,0 +1,48 @@
+### K Nearest Neighbor Regression
+K Nearest Neighbor (KNN) Regression is one of the simplest forms of regression
+in the machine learning toolbox. Training data is stored on the model and all
+computation is deferred until the time of prediction. When a new observation
+is provided it calculates the distance between the new observation and all
+training data in an n-dimensional space (where n is the number of variables).
+The predicted value is the average value of the k closest training records.
+
+Below is a simple example of using KNN Regression in Lurn.
+
+Suppose we have a dataset containing the age, years of college eduction and annual
+income for a set of people. We could use this as training data to predict
+people's annual income based on their age and years of eduction.
+
+```ruby
+people = [
+  # age   years of education   annual income
+  [ 25,   4,                   50000],
+  [ 35,   6,                   60000],
+  [ 51,   2,                   40000],
+  [ 45,   8,                   90000],
+  [ 32,   4,                   70000],
+]
+
+# extract age and eduction
+predictors = people.map { |person| person[0..1] }
+
+# extract annual income
+target_var = people.map { |person| person[2]}
+```
+
+The model can be trained by passing the predictors and target values to an initialized
+instance of the KNNRegression model.
+
+```ruby
+# initialize the model with a k of 2
+model = Lurn::Neighbors::KNNRegression.new(2)
+
+model.fit(predictors, target_var)
+```
+
+The model can now be used to predict the income of a person given his/her
+age and years of education.
+
+```ruby
+# predict the income of a 31 year old person with 4 years of eduction
+model.predict([31, 4])
+```

data/readmes/text_processing/bernoulli_vectorizer.md
ADDED
@@ -0,0 +1,30 @@
+# Bernoulli Vectorizer (word presence vectorizer)
+
+A bernoulli document model is one that represents a piece of text as an array of boolean values. Each boolean represents the presence (true) or absence (false) of a word in the document.
+
+`Lurn::Text::BernoulliVectorizer` is intended to make it easy to convert text into Bernoulli vectors.
+
+## Basic example
+```
+docs = ['hello world', 'hello fred']
+
+vectorizer = Lurn::Text::BernoulliVectorizer.new
+
+# vectorizers must be trained in order to know
+# what features (words) exist in the data set
+vectorizer.fit(docs)
+
+vectorizer.transform(docs)
+```
+
+## Configuration
+The BernoulliVectorizer.new includes a number of options for configuring how documents are vectorized. A few include:
+- max_df[int]: Excludes words which appear in more than `max_df` documents
+- min_df[int]: Excludes words which appear in fewer than `min_df` documents
+- strip_stopwords[boolean]: Removes stop words if true
+- stem_words[boolean]: Stems words in the documents if true
+- ngrams[int]: Features will be determined based on groupings of `ngrams` consecutive words instead of individual words
+
+```
+Lurn::BernoulliVectorizer.new(strip_stopwords: true, min_df: 10)
+```

metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: lurn
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - daniel.carpenter
 autorequire:
 bindir: exe
 cert_chain: []
-date:
+date: 2018-08-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: terminal-table
@@ -16,14 +16,20 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.8.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.
+        version: 1.8.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.0
 - !ruby/object:Gem::Dependency
   name: ruby-stemmer
   requirement: !ruby/object:Gem::Requirement
@@ -44,14 +50,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1
+        version: 0.2.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.1
+        version: 0.2.1
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -98,30 +104,70 @@ dependencies:
   name: awesome_print
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: byebug
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
     - - ">="
       - !ruby/object:Gem::Version
-      version:
+        version: 10.0.2
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
     - - ">="
       - !ruby/object:Gem::Version
-      version:
+        version: 10.0.2
+- !ruby/object:Gem::Dependency
+  name: rspec_junit_formatter
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.4'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.1
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.9
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.9
 description: " A gem with tools for machine learning. "
 email:
 - daniel.carpenter01@gmail.com
@@ -129,6 +175,7 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".circleci/config.yml"
 - ".gitignore"
 - ".rspec"
 - ".travis.yml"
@@ -140,11 +187,22 @@ files:
 - bin/setup
 - lib/lurn.rb
 - lib/lurn/evaluation/classifier_evaluator.rb
+- lib/lurn/naive_bayes/base.rb
 - lib/lurn/naive_bayes/bernoulli_naive_bayes.rb
+- lib/lurn/naive_bayes/multinomial_naive_bayes.rb
+- lib/lurn/neighbors/knn_base.rb
+- lib/lurn/neighbors/knn_classifier.rb
+- lib/lurn/neighbors/knn_regression.rb
 - lib/lurn/text/bernoulli_vectorizer.rb
+- lib/lurn/text/word_count_vectorizer.rb
 - lib/lurn/text/word_tokenizer.rb
-- lib/lurn/version.rb
 - lurn.gemspec
+- readmes/evaluation/classifier_evaluator.md
+- readmes/naive_bayes/bernoulli_naive_bayes.md
+- readmes/naive_bayes/multinomial_naive_bayes.md
+- readmes/neighbors/knn_classification.md
+- readmes/neighbors/knn_regression.md
+- readmes/text_processing/bernoulli_vectorizer.md
 homepage: https://www.github.com/dansbits/lurn
 licenses:
 - MIT

data/lib/lurn/version.rb
DELETED