idhja22 0.14.4 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -1
- data/bin/idhja22 +12 -3
- data/idhja22.gemspec +2 -2
- data/lib/idhja22/bayes.rb +50 -2
- data/lib/idhja22/binary_classifier.rb +54 -0
- data/lib/idhja22/config/default.rb +1 -0
- data/lib/idhja22/dataset/errors.rb +1 -0
- data/lib/idhja22/dataset.rb +22 -5
- data/lib/idhja22/tree/node.rb +57 -14
- data/lib/idhja22/tree.rb +3 -45
- data/lib/idhja22/version.rb +1 -1
- data/lib/idhja22.rb +1 -0
- data/spec/bayes_spec.rb +114 -0
- data/spec/{another_large_spec_data.csv → data/another_large_spec_data.csv} +0 -0
- data/spec/data/evenly_split.csv +9 -0
- data/spec/{large_spec_data.csv → data/large_spec_data.csv} +0 -0
- data/spec/{spec_data.csv → data/spec_data.csv} +0 -0
- data/spec/dataset_spec.rb +49 -2
- data/spec/node/decision_node_spec.rb +205 -0
- data/spec/node/leaf_node_spec.rb +53 -0
- data/spec/spec_helper.rb +4 -0
- data/spec/tree_spec.rb +4 -4
- data/spec/version_spec.rb +1 -1
- metadata +19 -14
- data/spec/node_spec.rb +0 -97
data/.gitignore
CHANGED
data/bin/idhja22
CHANGED
@@ -4,13 +4,22 @@ require 'thor'
|
|
4
4
|
require 'idhja22'
|
5
5
|
|
6
6
|
class TrainAndValidate < Thor
|
7
|
-
desc "
|
7
|
+
desc "train_and_validate_tree FILE", "train a tree for the given file and validate is against a validation set"
|
8
|
+
method_option :attributes, :type => :array
|
8
9
|
method_option :"training-proportion", :type => :numeric, :default => 1.0, :aliases => 't'
|
9
|
-
def
|
10
|
-
t, v = Idhja22::Tree.train_and_validate_from_csv(filename, options
|
10
|
+
def train_and_validate_tree(filename)
|
11
|
+
t, v = Idhja22::Tree.train_and_validate_from_csv(filename, options)
|
11
12
|
puts t.get_rules
|
12
13
|
puts "Against validation set probability of successful classifiction: #{v}" if options[:"training-proportion"] < 1.0
|
13
14
|
end
|
15
|
+
|
16
|
+
desc "train_and_validate_bayes FILE", "train a naive Bayesian classifier for the given file and validate is against a validation set"
|
17
|
+
method_option :attributes, :type => :array
|
18
|
+
method_option :"training-proportion", :type => :numeric, :default => 1.0, :aliases => 't'
|
19
|
+
def train_and_validate_bayes(filename)
|
20
|
+
t, v = Idhja22::Bayes.train_and_validate_from_csv(filename, options)
|
21
|
+
puts "Against validation set probability of successful classifiction: #{v}" if options[:"training-proportion"] < 1.0
|
22
|
+
end
|
14
23
|
end
|
15
24
|
|
16
25
|
TrainAndValidate.start
|
data/idhja22.gemspec
CHANGED
@@ -7,8 +7,8 @@ Gem::Specification.new do |gem|
|
|
7
7
|
gem.name = "idhja22"
|
8
8
|
gem.version = Idhja22::VERSION
|
9
9
|
gem.authors = ["Henry Addison"]
|
10
|
-
gem.description = %q{
|
11
|
-
gem.summary = %q{A gem for creating decision trees}
|
10
|
+
gem.description = %q{Classifiers}
|
11
|
+
gem.summary = %q{A gem for creating classifiers (decision trees and naive Bayes so far)}
|
12
12
|
gem.homepage = "https://github.com/henryaddison/idhja22"
|
13
13
|
|
14
14
|
gem.files = `git ls-files`.split($/)
|
data/lib/idhja22/bayes.rb
CHANGED
@@ -1,5 +1,53 @@
|
|
1
1
|
module Idhja22
|
2
|
-
class Bayes
|
3
|
-
|
2
|
+
class Bayes < BinaryClassifier
|
3
|
+
attr_accessor :conditional_probabilities, :prior_probabilities
|
4
|
+
class << self
|
5
|
+
def calculate_conditional_probabilities dataset, attribute_labels_to_use
|
6
|
+
conditional_probabilities = {}
|
7
|
+
attribute_labels_to_use.each do |attr_label|
|
8
|
+
conditional_probabilities[attr_label] = {}
|
9
|
+
dataset.partition_by_category.each do |cat, uniform_category_ds|
|
10
|
+
conditional_probabilities[attr_label][cat] = Hash.new(0)
|
11
|
+
partitioned_data = uniform_category_ds.partition(attr_label)
|
12
|
+
partitioned_data.each do |attr_value, uniform_value_ds|
|
13
|
+
conditional_probabilities[attr_label][cat][attr_value] = uniform_value_ds.size.to_f/uniform_category_ds.size.to_f
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
return conditional_probabilities
|
19
|
+
end
|
20
|
+
|
21
|
+
def calculate_priors dataset
|
22
|
+
output = Hash.new(0)
|
23
|
+
dataset.category_counts.each do |cat, count|
|
24
|
+
output[cat] = count.to_f/dataset.size.to_f
|
25
|
+
end
|
26
|
+
return output
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def evaluate(query)
|
31
|
+
nb_values = {}
|
32
|
+
total_values = 0
|
33
|
+
|
34
|
+
prior_probabilities.each do |cat, prior_prob|
|
35
|
+
nb_value = prior_prob
|
36
|
+
conditional_probabilities.each do |attr_label, probs|
|
37
|
+
raise Idhja22::Dataset::Datum::UnknownAttributeValue, "Not seen value #{query[attr_label]} for attribute #{attr_label} in training." unless probs[cat].has_key? query[attr_label]
|
38
|
+
nb_value *= probs[cat][query[attr_label]]
|
39
|
+
end
|
40
|
+
total_values += nb_value
|
41
|
+
nb_values[cat] = nb_value
|
42
|
+
end
|
43
|
+
|
44
|
+
return nb_values['Y']/total_values
|
45
|
+
end
|
46
|
+
|
47
|
+
def train(dataset, attributes_to_use)
|
48
|
+
self.conditional_probabilities = self.class.calculate_conditional_probabilities(dataset, attributes_to_use)
|
49
|
+
self.prior_probabilities = self.class.calculate_priors(dataset)
|
50
|
+
return self
|
51
|
+
end
|
4
52
|
end
|
5
53
|
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module Idhja22
|
2
|
+
class BinaryClassifier
|
3
|
+
|
4
|
+
class << self
|
5
|
+
# Trains a classifier using the provided Dataset.
|
6
|
+
def train(dataset, opts = {})
|
7
|
+
attributes_to_use = (opts[:attributes] || dataset.attribute_labels)
|
8
|
+
classifier = new
|
9
|
+
classifier.train(dataset, attributes_to_use)
|
10
|
+
return classifier
|
11
|
+
end
|
12
|
+
|
13
|
+
# Takes a dataset and splits it randomly into training and validation data.
|
14
|
+
# Uses the training data to train a classifier whose perfomance then measured using the validation data.
|
15
|
+
# @param [Float] Proportion of dataset to use for training. The rest will be used to validate the resulting classifier.
|
16
|
+
def train_and_validate(dataset, opts = {})
|
17
|
+
opts[:"training-proportion"] ||= 0.5
|
18
|
+
training_set, validation_set = dataset.split(opts[:"training-proportion"])
|
19
|
+
tree = self.train(training_set, opts)
|
20
|
+
validation_value = tree.validate(validation_set)
|
21
|
+
return tree, validation_value
|
22
|
+
end
|
23
|
+
|
24
|
+
# see #train
|
25
|
+
# @note Takes a CSV filename rather than a Dataset
|
26
|
+
def train_from_csv(filename, opts={})
|
27
|
+
ds = Dataset.from_csv(filename)
|
28
|
+
train(ds, opts)
|
29
|
+
end
|
30
|
+
|
31
|
+
# see #train_and_validate
|
32
|
+
# @note Takes a CSV filename rather than a Dataset
|
33
|
+
def train_and_validate_from_csv(filename, opts={})
|
34
|
+
ds = Dataset.from_csv(filename)
|
35
|
+
train_and_validate(ds, opts)
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def validate(ds)
|
40
|
+
output = 0
|
41
|
+
ds.data.each do |validation_point|
|
42
|
+
begin
|
43
|
+
prob = evaluate(validation_point)
|
44
|
+
output += (validation_point.category == 'Y' ? prob : 1.0 - prob)
|
45
|
+
rescue Idhja22::Dataset::Datum::UnknownAttributeValue
|
46
|
+
# if don't recognised the attribute value in the example, then assume the worst:
|
47
|
+
# will never classify this point correctly
|
48
|
+
# equivalent to output += 0 but no point running this
|
49
|
+
end
|
50
|
+
end
|
51
|
+
return output.to_f/ds.size.to_f
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
data/lib/idhja22/dataset.rb
CHANGED
@@ -17,13 +17,13 @@ module Idhja22
|
|
17
17
|
category_label = labels.pop
|
18
18
|
attribute_labels = labels
|
19
19
|
|
20
|
-
|
20
|
+
set = new([], attribute_labels, category_label)
|
21
21
|
csv.each do |row|
|
22
22
|
training_example = Example.new(row, attribute_labels, category_label)
|
23
|
-
|
23
|
+
set << training_example
|
24
24
|
end
|
25
25
|
|
26
|
-
|
26
|
+
return set
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
@@ -36,8 +36,9 @@ module Idhja22
|
|
36
36
|
|
37
37
|
def category_counts
|
38
38
|
counts = Hash.new(0)
|
39
|
-
|
40
|
-
|
39
|
+
split_data = partition_by_category
|
40
|
+
split_data.each do |cat, d|
|
41
|
+
counts[cat] = d.size
|
41
42
|
end
|
42
43
|
return counts
|
43
44
|
end
|
@@ -66,5 +67,21 @@ module Idhja22
|
|
66
67
|
|
67
68
|
return training_set, validation_set
|
68
69
|
end
|
70
|
+
|
71
|
+
def <<(example)
|
72
|
+
raise Idhja22::Dataset::Datum::UnknownCategoryLabel unless example.category_label == self.category_label
|
73
|
+
raise Idhja22::Dataset::Datum::UnknownAttributeLabel unless example.attribute_labels == self.attribute_labels
|
74
|
+
self.data << example
|
75
|
+
end
|
76
|
+
|
77
|
+
def partition_by_category
|
78
|
+
output = Hash.new do |hash, key|
|
79
|
+
hash[key] = self.class.new([], attribute_labels, category_label)
|
80
|
+
end
|
81
|
+
self.data.each do |d|
|
82
|
+
output[d.category] << d
|
83
|
+
end
|
84
|
+
return output
|
85
|
+
end
|
69
86
|
end
|
70
87
|
end
|
data/lib/idhja22/tree/node.rb
CHANGED
@@ -20,9 +20,7 @@ module Idhja22
|
|
20
20
|
return Idhja22::LeafNode.new(dataset.probability, dataset.category_label)
|
21
21
|
end
|
22
22
|
|
23
|
-
|
24
|
-
|
25
|
-
node = Idhja22::DecisionNode.new(data_split, best_attribute, attributes_available-[best_attribute], depth, dataset.probability)
|
23
|
+
node = DecisionNode.build(dataset, attributes_available, depth)
|
26
24
|
|
27
25
|
return node
|
28
26
|
end
|
@@ -59,21 +57,34 @@ module Idhja22
|
|
59
57
|
|
60
58
|
class DecisionNode < Node
|
61
59
|
attr_reader :branches, :decision_attribute
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
60
|
+
|
61
|
+
class << self
|
62
|
+
def build(dataset, attributes_available, depth)
|
63
|
+
data_split, best_attribute = best_attribute(dataset, attributes_available)
|
64
|
+
|
65
|
+
output_node = new(best_attribute)
|
66
|
+
|
67
|
+
data_split.each do |value, dataset|
|
68
|
+
node = Node.build_node(dataset, attributes_available-[best_attribute], depth+1, dataset.probability)
|
69
|
+
|
70
|
+
output_node.add_branch(value, node) if node && !(node.is_a?(DecisionNode) && node.branches.empty?)
|
72
71
|
end
|
73
|
-
|
72
|
+
|
73
|
+
output_node.cleanup_children!
|
74
|
+
|
75
|
+
return output_node
|
74
76
|
end
|
75
77
|
end
|
76
78
|
|
79
|
+
def initialize(decision_attribute)
|
80
|
+
@decision_attribute = decision_attribute
|
81
|
+
@branches = {}
|
82
|
+
end
|
83
|
+
|
84
|
+
def add_branch(attr_value, node)
|
85
|
+
@branches[attr_value] = node
|
86
|
+
end
|
87
|
+
|
77
88
|
def get_rules
|
78
89
|
rules = []
|
79
90
|
branches.each do |v,n|
|
@@ -104,6 +115,29 @@ module Idhja22
|
|
104
115
|
raise Idhja22::Dataset::Datum::UnknownAttributeValue, "when looking at attribute labelled #{self.decision_attribute} could not find branch for value #{queried_value}" if branch.nil?
|
105
116
|
branch.evaluate(query)
|
106
117
|
end
|
118
|
+
|
119
|
+
def cleanup_children!
|
120
|
+
branches.each do |attr, child_node|
|
121
|
+
child_node.cleanup_children!
|
122
|
+
leaves = child_node.leaves
|
123
|
+
probs = leaves.collect(&:probability)
|
124
|
+
if(probs.max - probs.min < Idhja22.config.probability_delta)
|
125
|
+
new_node = LeafNode.new(probs.max, category_label)
|
126
|
+
add_branch(attr, new_node)
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
def leaves
|
132
|
+
raise Idhja22::IncompleteTree, "decision node with no branches" if branches.empty?
|
133
|
+
branches.values.flat_map do |child_node|
|
134
|
+
child_node.leaves
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
def category_label
|
139
|
+
leaves.first.category_label
|
140
|
+
end
|
107
141
|
end
|
108
142
|
|
109
143
|
class LeafNode < Node
|
@@ -125,5 +159,14 @@ module Idhja22
|
|
125
159
|
raise Idhja22::Dataset::Datum::UnknownCategoryLabel, "expected category label for query is #{query.category_label} but node is using #{self.category_label}" unless query.category_label == self.category_label
|
126
160
|
return probability
|
127
161
|
end
|
162
|
+
|
163
|
+
def leaves
|
164
|
+
return [self]
|
165
|
+
end
|
166
|
+
|
167
|
+
# no-op method - a leaf node has no children by definition
|
168
|
+
def cleanup_children!
|
169
|
+
|
170
|
+
end
|
128
171
|
end
|
129
172
|
end
|
data/lib/idhja22/tree.rb
CHANGED
@@ -2,42 +2,15 @@ require "idhja22/tree/node"
|
|
2
2
|
|
3
3
|
module Idhja22
|
4
4
|
# The main entry class for a training, viewing and evaluating a decision tree.
|
5
|
-
class Tree
|
5
|
+
class Tree < BinaryClassifier
|
6
6
|
attr_accessor :root
|
7
7
|
class << self
|
8
|
-
# Trains a Tree using the provided Dataset.
|
9
|
-
def train(dataset)
|
10
|
-
new(dataset, dataset.attribute_labels)
|
11
|
-
end
|
12
|
-
|
13
|
-
# Takes a dataset and splits it randomly into training and validation data.
|
14
|
-
# Uses the training data to train a tree whose perfomance then measured using the validation data.
|
15
|
-
# @param [Float] Proportion of dataset to use for training. The rest will be used to validate the resulting tree.
|
16
|
-
def train_and_validate(dataset, training_proportion=0.5)
|
17
|
-
training_set, validation_set = dataset.split(training_proportion)
|
18
|
-
tree = self.train(training_set)
|
19
|
-
validation_value = tree.validate(validation_set)
|
20
|
-
return tree, validation_value
|
21
|
-
end
|
22
|
-
|
23
|
-
# see #train
|
24
|
-
# @note Takes a CSV filename rather than a Dataset
|
25
|
-
def train_from_csv(filename)
|
26
|
-
ds = Dataset.from_csv(filename)
|
27
|
-
train(ds)
|
28
|
-
end
|
29
|
-
|
30
|
-
# see #train_and_validate
|
31
|
-
# @note Takes a CSV filename rather than a Dataset
|
32
|
-
def train_and_validate_from_csv(filename, training_proportion=0.5)
|
33
|
-
ds = Dataset.from_csv(filename)
|
34
|
-
train_and_validate(ds, training_proportion)
|
35
|
-
end
|
36
8
|
end
|
37
9
|
|
38
|
-
def
|
10
|
+
def train(dataset, attributes_available)
|
39
11
|
raise Idhja22::Dataset::InsufficientData, "require at least #{Idhja22.config.min_dataset_size} data points, only have #{dataset.size} in data set provided" if(dataset.size < Idhja22.config.min_dataset_size)
|
40
12
|
@root = Node.build_node(dataset, attributes_available, 0)
|
13
|
+
return self
|
41
14
|
end
|
42
15
|
|
43
16
|
def get_rules
|
@@ -52,20 +25,5 @@ module Idhja22
|
|
52
25
|
def evaluate query
|
53
26
|
@root.evaluate(query)
|
54
27
|
end
|
55
|
-
|
56
|
-
def validate(ds)
|
57
|
-
output = 0
|
58
|
-
ds.data.each do |validation_point|
|
59
|
-
begin
|
60
|
-
prob = evaluate(validation_point)
|
61
|
-
output += (validation_point.category == 'Y' ? prob : 1.0 - prob)
|
62
|
-
rescue Idhja22::Dataset::Datum::UnknownAttributeValue
|
63
|
-
# if don't recognised the attribute value in the example, then assume the worst:
|
64
|
-
# will never classify this point correctly
|
65
|
-
# equivalent to output += 0 but no point running this
|
66
|
-
end
|
67
|
-
end
|
68
|
-
return output.to_f/ds.size.to_f
|
69
|
-
end
|
70
28
|
end
|
71
29
|
end
|
data/lib/idhja22/version.rb
CHANGED
data/lib/idhja22.rb
CHANGED
data/spec/bayes_spec.rb
CHANGED
@@ -1,5 +1,119 @@
|
|
1
1
|
require 'spec_helper'
|
2
2
|
|
3
3
|
describe Idhja22::Bayes do
|
4
|
+
before(:all) do
|
5
|
+
@ds = Idhja22::Dataset.from_csv(File.join(data_dir,'large_spec_data.csv'))
|
6
|
+
end
|
4
7
|
|
8
|
+
describe '.train' do
|
9
|
+
it 'should train a classifier from a dataset' do
|
10
|
+
classifier = Idhja22::Bayes.train @ds, :attributes => %w{0}
|
11
|
+
cond_probs = classifier.conditional_probabilities
|
12
|
+
cond_probs.keys.should == ['0']
|
13
|
+
|
14
|
+
cond_probs['0'].keys.should == ['Y', 'N']
|
15
|
+
|
16
|
+
cond_probs['0']['Y']['a'].should == 5.0/6.0
|
17
|
+
cond_probs['0']['N']['a'].should == 0.75
|
18
|
+
|
19
|
+
cond_probs['0']['Y']['b'].should == 1.0/6.0
|
20
|
+
cond_probs['0']['N']['b'].should == 0.25
|
21
|
+
|
22
|
+
prior_probs = classifier.prior_probabilities
|
23
|
+
prior_probs['Y'].should == 0.6
|
24
|
+
prior_probs['N'].should == 0.4
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
describe '.calculate_conditional_probabilities' do
|
29
|
+
it 'should calculate the conditional probabilities of P(Cat|attr_val) from dataset for given attribute labels' do
|
30
|
+
cond_probs = Idhja22::Bayes.calculate_conditional_probabilities @ds, %w{0 2}
|
31
|
+
cond_probs.keys.should == ['0', '2']
|
32
|
+
cond_probs['0'].keys.should == ['Y','N']
|
33
|
+
cond_probs['2'].keys.should == ['Y','N']
|
34
|
+
|
35
|
+
cond_probs['0']['Y']['a'].should == 5.0/6.0
|
36
|
+
cond_probs['0']['N']['a'].should == 0.75
|
37
|
+
cond_probs['0']['Y']['b'].should == 1.0/6.0
|
38
|
+
cond_probs['0']['N']['b'].should == 0.25
|
39
|
+
|
40
|
+
cond_probs['2']['Y']['a'].should == 1.0
|
41
|
+
cond_probs['2']['N']['a'].should == 0.5
|
42
|
+
cond_probs['2']['Y']['b'].should == 0
|
43
|
+
cond_probs['2']['N']['b'].should == 0.5
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
describe '.calculate_priors' do
|
48
|
+
it 'should calculate the prior probabilities' do
|
49
|
+
prior_probs = Idhja22::Bayes.calculate_priors @ds
|
50
|
+
prior_probs['Y'].should == 0.6
|
51
|
+
prior_probs['N'].should == 0.4
|
52
|
+
end
|
53
|
+
|
54
|
+
context 'all single category' do
|
55
|
+
it 'should return 0 for other categories' do
|
56
|
+
uniform_ds = Idhja22::Dataset.new([Idhja22::Dataset::Example.new(['high', '20-30', 'vanilla', 'Y'], ['Confidence', 'Age group', 'fav ice cream'] , 'Loves Reading')], ['Confidence', 'Age group', 'fav ice cream'], 'Loves Reading')
|
57
|
+
prior_probs = Idhja22::Bayes.calculate_priors uniform_ds
|
58
|
+
prior_probs['Y'].should == 1.0
|
59
|
+
prior_probs['N'].should == 0
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
describe '#evaluate' do
|
65
|
+
before(:all) do
|
66
|
+
@bayes = Idhja22::Bayes.new
|
67
|
+
@bayes.conditional_probabilities = {
|
68
|
+
'age' => {
|
69
|
+
'Y' => {'young' => 0.98, 'old' => 0.02},
|
70
|
+
'N' => {'young' => 0.98, 'old' => 0.02}
|
71
|
+
|
72
|
+
},
|
73
|
+
'confidence' => {
|
74
|
+
'Y' => {'high' => 0.6, 'medium' => 0.3, 'low' => 0.1},
|
75
|
+
'N' => {'high' => 0.8, 'medium' => 0.15, 'low' => 0.05}
|
76
|
+
},
|
77
|
+
'fav ice cream' => {
|
78
|
+
'Y' => {'vanilla' => 0.75, 'strawberry' => 0.25},
|
79
|
+
'N' => {'vanilla' => 0.5, 'strawberry' => 0.6}
|
80
|
+
}
|
81
|
+
}
|
82
|
+
@bayes.prior_probabilities = {'Y' => 0.75, 'N' => 0.25}
|
83
|
+
end
|
84
|
+
|
85
|
+
context 'Y likely' do
|
86
|
+
it 'should return probability of being Y' do
|
87
|
+
query = Idhja22::Dataset::Datum.new(['high', 'young', 'vanilla', 'cheddar'], ['confidence', 'age', 'fav ice cream', 'fav cheese'], 'Loves Reading')
|
88
|
+
@bayes.evaluate(query).should be_within(0.00001).of(0.77143)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
context 'N likely' do
|
93
|
+
it 'should return probability of being Y' do
|
94
|
+
query = Idhja22::Dataset::Datum.new(['high', 'young', 'strawberry', 'cheddar'], ['confidence', 'age', 'fav ice cream', 'fav cheese'], 'Loves Reading')
|
95
|
+
@bayes.evaluate(query).should be_within(0.00001).of(0.48387)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
context 'unrecognised attribute value' do
|
100
|
+
it 'should throw an error' do
|
101
|
+
query = Idhja22::Dataset::Datum.new(['high', 'young', 'chocolate', 'cheddar'], ['confidence', 'age', 'fav ice cream', 'fav cheese'], 'Loves Reading')
|
102
|
+
expect { @bayes.evaluate(query) }.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeValue)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
describe '#validate' do
|
108
|
+
before(:all) do
|
109
|
+
@bayes = Idhja22::Bayes.train(@ds)
|
110
|
+
end
|
111
|
+
|
112
|
+
it 'should return the average probability that the tree gets the validation examples correct' do
|
113
|
+
vds = Idhja22::Dataset.new([], ['0', '1','2','3','4'],'C')
|
114
|
+
vds << Idhja22::Dataset::Example.new(['a','a','a','a','a','Y'],['0', '1','2','3','4'],'C')
|
115
|
+
vds << Idhja22::Dataset::Example.new(['a','a','a','a','a','N'],['0', '1','2','3','4'],'C')
|
116
|
+
@bayes.validate(vds).should == 0.5
|
117
|
+
end
|
118
|
+
end
|
5
119
|
end
|
File without changes
|
File without changes
|
File without changes
|
data/spec/dataset_spec.rb
CHANGED
@@ -10,7 +10,7 @@ describe Idhja22::Dataset do
|
|
10
10
|
|
11
11
|
describe 'from_csv' do
|
12
12
|
before(:all) do
|
13
|
-
@ds = Idhja22::Dataset.from_csv(File.join(
|
13
|
+
@ds = Idhja22::Dataset.from_csv(File.join(data_dir,'spec_data.csv'))
|
14
14
|
end
|
15
15
|
|
16
16
|
it 'should extract labels' do
|
@@ -50,7 +50,7 @@ describe Idhja22::Dataset do
|
|
50
50
|
|
51
51
|
context 'ready made' do
|
52
52
|
before(:all) do
|
53
|
-
@ds = Idhja22::Dataset.from_csv(File.join(
|
53
|
+
@ds = Idhja22::Dataset.from_csv(File.join(data_dir,'large_spec_data.csv'))
|
54
54
|
end
|
55
55
|
|
56
56
|
describe '#partition' do
|
@@ -125,6 +125,53 @@ describe Idhja22::Dataset do
|
|
125
125
|
vs.size.should == 3
|
126
126
|
end
|
127
127
|
end
|
128
|
+
|
129
|
+
describe '#partition_by_category' do
|
130
|
+
it 'should divide the data set into a set of all Ys and a set of all Ns' do
|
131
|
+
sets = @ds.partition_by_category
|
132
|
+
sets.length.should == 2
|
133
|
+
sets['Y'].data.collect(&:category).uniq.should == ['Y']
|
134
|
+
sets['N'].data.collect(&:category).uniq.should == ['N']
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
describe '#<<' do
|
139
|
+
it 'should all datum to list of data' do
|
140
|
+
added_datum = Idhja22::Dataset::Example.new(['a','b','c','d','e', 'Y'],['0','1','2','3','4'],'C')
|
141
|
+
expect { @ds << added_datum}.to change(@ds, :size)
|
142
|
+
@ds.data.last.should == added_datum
|
143
|
+
end
|
144
|
+
|
145
|
+
context 'mismatched category label' do
|
146
|
+
it 'should throw an error' do
|
147
|
+
added_datum = Idhja22::Dataset::Example.new(['a','b','c','d','e', 'Y'],['0','1','2','3','4'],'D')
|
148
|
+
expect { @ds << added_datum}.to raise_error(Idhja22::Dataset::Datum::UnknownCategoryLabel)
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
context 'mismatching attributes' do
|
153
|
+
context 'extra attribute' do
|
154
|
+
it 'should throw an error' do
|
155
|
+
added_datum = Idhja22::Dataset::Example.new(['a','b','c','d','e', 'f', 'Y'],['0','1','2','3','4', '5'],'C')
|
156
|
+
expect { @ds << added_datum}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
|
157
|
+
end
|
158
|
+
end
|
159
|
+
|
160
|
+
context 'missing attribute' do
|
161
|
+
it 'should throw an error' do
|
162
|
+
added_datum = Idhja22::Dataset::Example.new(['a','b','c','d', 'Y'],['0','1','2','3'],'C')
|
163
|
+
expect { @ds << added_datum}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
context 'different attribute' do
|
168
|
+
it 'should throw an error' do
|
169
|
+
added_datum = Idhja22::Dataset::Example.new(['a','b','c','d', 'e', 'Y'],['0','1','2','3','9'],'C')
|
170
|
+
expect { @ds << added_datum}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
|
171
|
+
end
|
172
|
+
end
|
173
|
+
end
|
174
|
+
end
|
128
175
|
end
|
129
176
|
end
|
130
177
|
end
|
@@ -0,0 +1,205 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Idhja22::DecisionNode do
|
4
|
+
before(:all) do
|
5
|
+
@ds = Idhja22::Dataset.from_csv(File.join(data_dir,'large_spec_data.csv'))
|
6
|
+
@simple_decision_node = Idhja22::DecisionNode.new('3')
|
7
|
+
|
8
|
+
l1 = Idhja22::LeafNode.new(0.75, 'C')
|
9
|
+
l2 = Idhja22::LeafNode.new(0.0, 'C')
|
10
|
+
|
11
|
+
@simple_decision_node.add_branch('a', l1)
|
12
|
+
@simple_decision_node.add_branch('b', l2)
|
13
|
+
end
|
14
|
+
|
15
|
+
describe('#get_rules') do
|
16
|
+
it 'should return a list of rules' do
|
17
|
+
@simple_decision_node.get_rules.should == ["3 == a and then chance of C = 0.75", "3 == b and then chance of C = 0.0"]
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe '#leaves' do
|
22
|
+
it 'should return a list of terminating values' do
|
23
|
+
@simple_decision_node.leaves.should == [Idhja22::LeafNode.new(0.75, 'C'), Idhja22::LeafNode.new(0.0, 'C')]
|
24
|
+
end
|
25
|
+
|
26
|
+
context 'a branch without a terminating leaf node' do
|
27
|
+
it 'should throw an error' do
|
28
|
+
decision_node = Idhja22::DecisionNode.new('a')
|
29
|
+
decision_node.add_branch('1', Idhja22::LeafNode.new(0.75, 'C'))
|
30
|
+
decision_node.add_branch('2', Idhja22::DecisionNode.new('b'))
|
31
|
+
|
32
|
+
expect { decision_node.leaves }.to raise_error(Idhja22::IncompleteTree)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe(' == ') do
|
38
|
+
it 'should return false with different decision attributes' do
|
39
|
+
dn = Idhja22::DecisionNode.new('2')
|
40
|
+
diff_dn = Idhja22::DecisionNode.new('3')
|
41
|
+
dn.should_not == diff_dn
|
42
|
+
end
|
43
|
+
|
44
|
+
it 'should return false with different branches' do
|
45
|
+
dn1 = Idhja22::DecisionNode.new('2')
|
46
|
+
diff_dn = Idhja22::DecisionNode.new('2')
|
47
|
+
|
48
|
+
leaf = Idhja22::LeafNode.new(0.75, 'C')
|
49
|
+
dn1.add_branch('value', leaf)
|
50
|
+
|
51
|
+
dn1.should_not == diff_dn
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'should return true if decision node and branches match' do
|
55
|
+
dn1 = Idhja22::DecisionNode.new('2')
|
56
|
+
dn2 = Idhja22::DecisionNode.new('2')
|
57
|
+
|
58
|
+
leaf = Idhja22::LeafNode.new(0.75, 'C')
|
59
|
+
dn1.add_branch('value', leaf)
|
60
|
+
dn2.add_branch('value', leaf)
|
61
|
+
|
62
|
+
dn1.should == dn2
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
describe 'category_label' do
|
67
|
+
it 'should return the category_label from the leaves' do
|
68
|
+
@simple_decision_node.category_label.should == 'C'
|
69
|
+
end
|
70
|
+
|
71
|
+
context 'incomplete node' do
|
72
|
+
it 'should throw an error' do
|
73
|
+
dn = Idhja22::DecisionNode.new('a')
|
74
|
+
expect { dn.category_label }.to raise_error(Idhja22::IncompleteTree)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
describe 'evaluate' do
|
81
|
+
it 'should follow node to probability' do
|
82
|
+
query = Idhja22::Dataset::Datum.new(['a', 'a'], ['3', '4'], 'C')
|
83
|
+
@simple_decision_node.evaluate(query).should == 0.75
|
84
|
+
|
85
|
+
query = Idhja22::Dataset::Datum.new(['b', 'a'], ['3', '4'], 'C')
|
86
|
+
@simple_decision_node.evaluate(query).should == 0.0
|
87
|
+
end
|
88
|
+
|
89
|
+
context 'mismatching attribute label' do
|
90
|
+
it 'should raise an error' do
|
91
|
+
query = Idhja22::Dataset::Datum.new(['b', 'a'], ['1', '2'], 'C')
|
92
|
+
expect {@simple_decision_node.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
context 'unknown attribute value' do
|
97
|
+
it 'should raise an error' do
|
98
|
+
query = Idhja22::Dataset::Datum.new(['c', 'a'], ['3', '4'], 'C')
|
99
|
+
expect {@simple_decision_node.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeValue)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
describe('.build') do
|
105
|
+
it 'should build a decision node based on the provided data' do
|
106
|
+
node = Idhja22::DecisionNode.build(@ds, @ds.attribute_labels, 0)
|
107
|
+
node.decision_attribute.should == "2"
|
108
|
+
node.branches.keys.should == ['a','b']
|
109
|
+
end
|
110
|
+
|
111
|
+
it 'should cleanup matching tails' do
|
112
|
+
ds = Idhja22::Dataset.from_csv(File.join(data_dir,'evenly_split.csv'))
|
113
|
+
node = Idhja22::DecisionNode.build(ds, ds.attribute_labels, 0)
|
114
|
+
node.get_rules.should == ['1 == a and then chance of C = 0.5', '1 == b and then chance of C = 0.5']
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
describe '#add_branch' do
|
119
|
+
it 'should add a branch for the given attribute value' do
|
120
|
+
node = Idhja22::DecisionNode.new 'attribute_name'
|
121
|
+
branch_node = Idhja22::DecisionNode.new 'other_name'
|
122
|
+
node.add_branch('value', branch_node)
|
123
|
+
node.branches.keys.should == ['value']
|
124
|
+
node.branches['value'].should == branch_node
|
125
|
+
end
|
126
|
+
end
|
127
|
+
|
128
|
+
describe '#cleanup_children!' do
|
129
|
+
context 'with matching output at level below' do
|
130
|
+
before(:all) do
|
131
|
+
@dn = Idhja22::DecisionNode.new('a')
|
132
|
+
@dn_below = Idhja22::DecisionNode.new('b')
|
133
|
+
@dn_below.add_branch('1', Idhja22::LeafNode.new(0.505, 'Category'))
|
134
|
+
@dn_below.add_branch('2', Idhja22::LeafNode.new(0.50, 'Category'))
|
135
|
+
@dn.add_branch('1', @dn_below)
|
136
|
+
end
|
137
|
+
it 'should merge any subnodes with same output into a single leafnode' do
|
138
|
+
@dn.cleanup_children!
|
139
|
+
@dn.branches['1'].should == Idhja22::LeafNode.new(0.505, 'Category')
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
context 'with matching output at two levels below' do
|
144
|
+
before(:all) do
|
145
|
+
@dn = Idhja22::DecisionNode.new('a')
|
146
|
+
@dn_1_below = Idhja22::DecisionNode.new('b')
|
147
|
+
@dn.add_branch('1', @dn_1_below)
|
148
|
+
|
149
|
+
@dn_2_below = Idhja22::DecisionNode.new('c')
|
150
|
+
@dn_1_below.add_branch('1', @dn_2_below)
|
151
|
+
|
152
|
+
@dn_2_below.add_branch('1', Idhja22::LeafNode.new(0.50, 'Category'))
|
153
|
+
@dn_2_below.add_branch('2', Idhja22::LeafNode.new(0.50, 'Category'))
|
154
|
+
end
|
155
|
+
|
156
|
+
it 'should merge nodes recusively' do
|
157
|
+
@dn.cleanup_children!
|
158
|
+
@dn.branches['1'].should == Idhja22::LeafNode.new(0.50, 'Category')
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
context 'with diverging branches that match internally' do
|
163
|
+
before(:all) do
|
164
|
+
@dn = Idhja22::DecisionNode.new('a')
|
165
|
+
|
166
|
+
dn_1_below = Idhja22::DecisionNode.new('b')
|
167
|
+
@dn.add_branch('1', dn_1_below)
|
168
|
+
|
169
|
+
dn_2_below = Idhja22::DecisionNode.new('c')
|
170
|
+
dn_1_below.add_branch('1', dn_2_below)
|
171
|
+
|
172
|
+
dn_2_below.add_branch('1', Idhja22::LeafNode.new(0.50, 'Category'))
|
173
|
+
dn_2_below.add_branch('2', Idhja22::LeafNode.new(0.50, 'Category'))
|
174
|
+
|
175
|
+
dn_2_below = Idhja22::DecisionNode.new('d')
|
176
|
+
dn_1_below.add_branch('2', dn_2_below)
|
177
|
+
|
178
|
+
dn_2_below.add_branch('1', Idhja22::LeafNode.new(0.70, 'Category'))
|
179
|
+
dn_2_below.add_branch('2', Idhja22::LeafNode.new(0.70, 'Category'))
|
180
|
+
end
|
181
|
+
|
182
|
+
it 'should merge nodes recusively' do
|
183
|
+
@dn.cleanup_children!
|
184
|
+
@dn.branches['1'].branches['1'].should == Idhja22::LeafNode.new(0.50, 'Category')
|
185
|
+
@dn.branches['1'].branches['2'].should == Idhja22::LeafNode.new(0.70, 'Category')
|
186
|
+
end
|
187
|
+
end
|
188
|
+
|
189
|
+
context 'without matching output' do
|
190
|
+
before(:all) do
|
191
|
+
@dn = Idhja22::DecisionNode.new('a')
|
192
|
+
@dn_below = Idhja22::DecisionNode.new('b')
|
193
|
+
@dn_below.add_branch('1', Idhja22::LeafNode.new(0.2, 'Category'))
|
194
|
+
@dn_below.add_branch('2', Idhja22::LeafNode.new(0.70, 'Category'))
|
195
|
+
@dn.add_branch('1', @dn_below)
|
196
|
+
end
|
197
|
+
|
198
|
+
it 'should do nothing' do
|
199
|
+
saved_rules = @dn.get_rules
|
200
|
+
@dn.cleanup_children!
|
201
|
+
@dn.get_rules.should == saved_rules
|
202
|
+
end
|
203
|
+
end
|
204
|
+
end
|
205
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Idhja22::LeafNode do
|
4
|
+
describe('.new') do
|
5
|
+
it 'should store probability and category label' do
|
6
|
+
l = Idhja22::LeafNode.new(0.75, 'label')
|
7
|
+
l.probability.should == 0.75
|
8
|
+
l.category_label.should == 'label'
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
describe('#get_rules') do
|
13
|
+
it 'should return the probability' do
|
14
|
+
l = Idhja22::LeafNode.new(0.75, 'pudding')
|
15
|
+
l.get_rules.should == ['then chance of pudding = 0.75']
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
describe(' == ') do
|
20
|
+
let(:l1) { Idhja22::LeafNode.new(0.75, 'pudding') }
|
21
|
+
let(:l2) { Idhja22::LeafNode.new(0.75, 'pudding') }
|
22
|
+
let(:diff_l1) { Idhja22::LeafNode.new(0.7, 'pudding') }
|
23
|
+
let(:diff_l2) { Idhja22::LeafNode.new(0.75, 'starter') }
|
24
|
+
it 'should compare attributes' do
|
25
|
+
l1.should == l2
|
26
|
+
l1.should_not == diff_l1
|
27
|
+
l1.should_not == diff_l2
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe 'evaluate' do
|
32
|
+
let(:leaf) { Idhja22::LeafNode.new(0.6, 'pudding') }
|
33
|
+
|
34
|
+
it 'should return probability' do
|
35
|
+
query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'pudding')
|
36
|
+
leaf.evaluate(query).should == 0.6
|
37
|
+
end
|
38
|
+
|
39
|
+
context 'mismatching category labels' do
|
40
|
+
it 'should raise error' do
|
41
|
+
query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'tennis')
|
42
|
+
expect {leaf.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownCategoryLabel)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
describe '#leaves' do
|
48
|
+
it 'should return self' do
|
49
|
+
leaf = Idhja22::LeafNode.new(0.6, 'pudding')
|
50
|
+
leaf.leaves.should == [leaf]
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
data/spec/spec_helper.rb
CHANGED
data/spec/tree_spec.rb
CHANGED
@@ -2,7 +2,7 @@ require 'spec_helper'
|
|
2
2
|
|
3
3
|
describe Idhja22::Tree do
|
4
4
|
before(:all) do
|
5
|
-
@ds = Idhja22::Dataset.from_csv(File.join(
|
5
|
+
@ds = Idhja22::Dataset.from_csv(File.join(data_dir,'large_spec_data.csv'))
|
6
6
|
end
|
7
7
|
|
8
8
|
|
@@ -29,7 +29,7 @@ describe Idhja22::Tree do
|
|
29
29
|
it 'should compare root nodes' do
|
30
30
|
tree1 = Idhja22::Tree.train(@ds)
|
31
31
|
tree2 = Idhja22::Tree.train(@ds)
|
32
|
-
diff_ds = Idhja22::Dataset.from_csv(File.join(
|
32
|
+
diff_ds = Idhja22::Dataset.from_csv(File.join(data_dir,'another_large_spec_data.csv'))
|
33
33
|
diff_tree = Idhja22::Tree.train(diff_ds)
|
34
34
|
tree1.should == tree2
|
35
35
|
tree1.should_not == diff_tree
|
@@ -39,7 +39,7 @@ describe Idhja22::Tree do
|
|
39
39
|
describe('.train_from_csv') do
|
40
40
|
it 'should make the same tree as the one from the dataset' do
|
41
41
|
tree = Idhja22::Tree.train(@ds)
|
42
|
-
csv_tree = Idhja22::Tree.train_from_csv(File.join(
|
42
|
+
csv_tree = Idhja22::Tree.train_from_csv(File.join(data_dir,'large_spec_data.csv'))
|
43
43
|
tree.should == csv_tree
|
44
44
|
end
|
45
45
|
end
|
@@ -85,7 +85,7 @@ describe Idhja22::Tree do
|
|
85
85
|
|
86
86
|
describe('.train_and_validate_from_csv') do
|
87
87
|
it 'should make the same tree as the one from the dataset' do
|
88
|
-
csv_tree, validation_value = Idhja22::Tree.train_and_validate_from_csv(File.join(
|
88
|
+
csv_tree, validation_value = Idhja22::Tree.train_and_validate_from_csv(File.join(data_dir,'large_spec_data.csv'), :"training-proportion" => 0.75)
|
89
89
|
csv_tree.is_a?(Idhja22::Tree).should be_true
|
90
90
|
(0..1).include?(validation_value).should be_true
|
91
91
|
end
|
data/spec/version_spec.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: idhja22
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -123,7 +123,7 @@ dependencies:
|
|
123
123
|
- - ! '>='
|
124
124
|
- !ruby/object:Gem::Version
|
125
125
|
version: '0'
|
126
|
-
description:
|
126
|
+
description: Classifiers
|
127
127
|
email:
|
128
128
|
executables:
|
129
129
|
- idhja22
|
@@ -141,6 +141,7 @@ files:
|
|
141
141
|
- idhja22.gemspec
|
142
142
|
- lib/idhja22.rb
|
143
143
|
- lib/idhja22/bayes.rb
|
144
|
+
- lib/idhja22/binary_classifier.rb
|
144
145
|
- lib/idhja22/config/default.rb
|
145
146
|
- lib/idhja22/dataset.rb
|
146
147
|
- lib/idhja22/dataset/datum.rb
|
@@ -149,13 +150,15 @@ files:
|
|
149
150
|
- lib/idhja22/tree.rb
|
150
151
|
- lib/idhja22/tree/node.rb
|
151
152
|
- lib/idhja22/version.rb
|
152
|
-
- spec/another_large_spec_data.csv
|
153
153
|
- spec/bayes_spec.rb
|
154
|
+
- spec/data/another_large_spec_data.csv
|
155
|
+
- spec/data/evenly_split.csv
|
156
|
+
- spec/data/large_spec_data.csv
|
157
|
+
- spec/data/spec_data.csv
|
154
158
|
- spec/dataset/example_spec.rb
|
155
159
|
- spec/dataset_spec.rb
|
156
|
-
- spec/
|
157
|
-
- spec/
|
158
|
-
- spec/spec_data.csv
|
160
|
+
- spec/node/decision_node_spec.rb
|
161
|
+
- spec/node/leaf_node_spec.rb
|
159
162
|
- spec/spec_helper.rb
|
160
163
|
- spec/tree_spec.rb
|
161
164
|
- spec/version_spec.rb
|
@@ -173,7 +176,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
173
176
|
version: '0'
|
174
177
|
segments:
|
175
178
|
- 0
|
176
|
-
hash:
|
179
|
+
hash: 3479458333568153307
|
177
180
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
178
181
|
none: false
|
179
182
|
requirements:
|
@@ -182,21 +185,23 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
182
185
|
version: '0'
|
183
186
|
segments:
|
184
187
|
- 0
|
185
|
-
hash:
|
188
|
+
hash: 3479458333568153307
|
186
189
|
requirements: []
|
187
190
|
rubyforge_project:
|
188
191
|
rubygems_version: 1.8.24
|
189
192
|
signing_key:
|
190
193
|
specification_version: 3
|
191
|
-
summary: A gem for creating decision trees
|
194
|
+
summary: A gem for creating classifiers (decision trees and naive Bayes so far)
|
192
195
|
test_files:
|
193
|
-
- spec/another_large_spec_data.csv
|
194
196
|
- spec/bayes_spec.rb
|
197
|
+
- spec/data/another_large_spec_data.csv
|
198
|
+
- spec/data/evenly_split.csv
|
199
|
+
- spec/data/large_spec_data.csv
|
200
|
+
- spec/data/spec_data.csv
|
195
201
|
- spec/dataset/example_spec.rb
|
196
202
|
- spec/dataset_spec.rb
|
197
|
-
- spec/
|
198
|
-
- spec/
|
199
|
-
- spec/spec_data.csv
|
203
|
+
- spec/node/decision_node_spec.rb
|
204
|
+
- spec/node/leaf_node_spec.rb
|
200
205
|
- spec/spec_helper.rb
|
201
206
|
- spec/tree_spec.rb
|
202
207
|
- spec/version_spec.rb
|
data/spec/node_spec.rb
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
require 'spec_helper'
|
2
|
-
|
3
|
-
describe Idhja22::LeafNode do
|
4
|
-
describe('.new') do
|
5
|
-
it 'should store probability and category label' do
|
6
|
-
l = Idhja22::LeafNode.new(0.75, 'label')
|
7
|
-
l.probability.should == 0.75
|
8
|
-
l.category_label.should == 'label'
|
9
|
-
end
|
10
|
-
end
|
11
|
-
|
12
|
-
describe('#get_rules') do
|
13
|
-
it 'should return the probability' do
|
14
|
-
l = Idhja22::LeafNode.new(0.75, 'pudding')
|
15
|
-
l.get_rules.should == ['then chance of pudding = 0.75']
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
describe(' == ') do
|
20
|
-
let(:l1) { Idhja22::LeafNode.new(0.75, 'pudding') }
|
21
|
-
let(:l2) { Idhja22::LeafNode.new(0.75, 'pudding') }
|
22
|
-
let(:diff_l1) { Idhja22::LeafNode.new(0.7, 'pudding') }
|
23
|
-
let(:diff_l2) { Idhja22::LeafNode.new(0.75, 'starter') }
|
24
|
-
it 'should compare attributes' do
|
25
|
-
l1.should == l2
|
26
|
-
l1.should_not == diff_l1
|
27
|
-
l1.should_not == diff_l2
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
describe 'evaluate' do
|
32
|
-
let(:leaf) { Idhja22::LeafNode.new(0.6, 'pudding') }
|
33
|
-
|
34
|
-
it 'should return probability' do
|
35
|
-
query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'pudding')
|
36
|
-
leaf.evaluate(query).should == 0.6
|
37
|
-
end
|
38
|
-
|
39
|
-
context 'mismatching category labels' do
|
40
|
-
it 'should raise error' do
|
41
|
-
query = Idhja22::Dataset::Datum.new(['high', 'gusty'], ['temperature', 'windy'], 'tennis')
|
42
|
-
expect {leaf.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownCategoryLabel)
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
describe Idhja22::DecisionNode do
|
49
|
-
before(:all) do
|
50
|
-
@ds = Idhja22::Dataset.from_csv(File.join(File.dirname(__FILE__),'large_spec_data.csv'))
|
51
|
-
end
|
52
|
-
|
53
|
-
describe('#get_rules') do
|
54
|
-
it 'should return a list of rules' do
|
55
|
-
l = Idhja22::DecisionNode.new(@ds.partition('2'), '3', [], 0, 0.75)
|
56
|
-
l.get_rules.should == ["3 == a and then chance of C = 0.75", "3 == b and then chance of C = 0.0"]
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
describe(' == ') do
|
61
|
-
let(:dn1) { Idhja22::DecisionNode.new(@ds.partition('2'), '2', [], 0, 0.75) }
|
62
|
-
let(:dn2) { Idhja22::DecisionNode.new(@ds.partition('2'), '2', [], 0, 0.75) }
|
63
|
-
let(:diff_dn1) { Idhja22::DecisionNode.new(@ds.partition('0'), '2', [], 0, 0.75) }
|
64
|
-
let(:diff_dn2) { Idhja22::DecisionNode.new(@ds.partition('3'), '3', [], 0, 0.75) }
|
65
|
-
|
66
|
-
it 'should compare ' do
|
67
|
-
dn1.should == dn2
|
68
|
-
dn1.should_not == diff_dn1
|
69
|
-
dn1.should_not == diff_dn2
|
70
|
-
end
|
71
|
-
end
|
72
|
-
|
73
|
-
describe 'evaluate' do
|
74
|
-
let(:dn) { Idhja22::DecisionNode.new(@ds.partition('2'), '3', [], 0, 0.75) }
|
75
|
-
it 'should follow node to probability' do
|
76
|
-
query = Idhja22::Dataset::Datum.new(['a', 'a'], ['3', '4'], 'C')
|
77
|
-
dn.evaluate(query).should == 0.75
|
78
|
-
|
79
|
-
query = Idhja22::Dataset::Datum.new(['b', 'a'], ['3', '4'], 'C')
|
80
|
-
dn.evaluate(query).should == 0.0
|
81
|
-
end
|
82
|
-
|
83
|
-
context 'mismatching attribute label' do
|
84
|
-
it 'should raise an error' do
|
85
|
-
query = Idhja22::Dataset::Datum.new(['b', 'a'], ['1', '2'], 'C')
|
86
|
-
expect {dn.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeLabel)
|
87
|
-
end
|
88
|
-
end
|
89
|
-
|
90
|
-
context 'unknown attribute value' do
|
91
|
-
it 'should raise an error' do
|
92
|
-
query = Idhja22::Dataset::Datum.new(['c', 'a'], ['3', '4'], 'C')
|
93
|
-
expect {dn.evaluate(query)}.to raise_error(Idhja22::Dataset::Datum::UnknownAttributeValue)
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|
97
|
-
end
|