scoruby 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -5
- data/lib/scoruby/models/naive_bayes/model.rb +64 -0
- data/lib/scoruby/models/naive_bayes/model_data.rb +68 -0
- data/lib/scoruby/models_factory.rb +6 -0
- data/lib/scoruby/version.rb +1 -1
- metadata +4 -3
- data/lib/scoruby/models/naive_bayes.rb +0 -92
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2a1c339dc5c029d90d8e3a495af90499368bc92d
|
4
|
+
data.tar.gz: 5ff3c0147290c83d76261b9a6907e35ca493b678
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8843f6c37d4b4d4e30f018348b5ec113e66558b2c78c945f4755ea01554b609af3dbb4a50334143ffa8f4bab205303b833831273a97bd18a2ecc9fa4e214f401
|
7
|
+
data.tar.gz: 94078bb11bd0cf3490c93c90e822fed248d4721e24a2623004d2cd3af8805407d6a69599ba2d753bd024caefb0d7e08eb8ddebefef1f37b3dbfe2dcdf49a3bad
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
scoruby (0.2.
|
4
|
+
scoruby (0.2.5)
|
5
5
|
nokogiri (~> 1.7)
|
6
6
|
|
7
7
|
GEM
|
@@ -18,9 +18,9 @@ GEM
|
|
18
18
|
docile (1.1.5)
|
19
19
|
json (2.1.0)
|
20
20
|
method_source (0.8.2)
|
21
|
-
mini_portile2 (2.
|
22
|
-
nokogiri (1.
|
23
|
-
mini_portile2 (~> 2.
|
21
|
+
mini_portile2 (2.2.0)
|
22
|
+
nokogiri (1.8.0)
|
23
|
+
mini_portile2 (~> 2.2.0)
|
24
24
|
pry (0.10.3)
|
25
25
|
coderay (~> 1.1.0)
|
26
26
|
method_source (~> 0.8.1)
|
@@ -64,4 +64,4 @@ DEPENDENCIES
|
|
64
64
|
scoruby!
|
65
65
|
|
66
66
|
BUNDLED WITH
|
67
|
-
1.
|
67
|
+
1.15.4
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'scoruby/models/naive_bayes/model_data'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module Scoruby
  module Models
    module NaiveBayes
      # Scores feature sets against a naive Bayes model. All model parameters
      # (threshold, label counts, categorical and numerical feature tables)
      # are read through delegators from the ModelData built in #initialize.
      class Model
        extend Forwardable
        def_delegators :@model_data, :threshold, :labels, :numerical_features, :category_features

        # @param xml the parsed model document; handed unchanged to ModelData.new
        def initialize(xml)
          @model_data = ModelData.new(xml)
        end

        # Computes the unnormalised likelihood of every label for +features+.
        #
        # The per-label factors are assembled in a fresh hash on every call, so
        # the label table held by the model data is never modified and one call
        # cannot influence the result of the next.
        #
        # @param features [Hash] feature name => observed value
        # @return [Hash] label => product of the label's stored factors and the
        #   likelihood of each recognised feature
        def lvalues(features)
          labels.each_with_object({}) do |(label, label_data), result|
            factors = label_data.merge(feature_likelihoods(features, label))
            result[label] = factors.values.map { |factor| apply_threshold(factor) }.reduce(:*)
          end
        end

        # Normalised share of +label+ among all label likelihoods.
        #
        # @param features [Hash] feature name => observed value
        # @param label the label to score, as keyed in #labels
        # @return [Float]
        def score(features, label)
          likelihoods = lvalues(features)
          likelihoods[label] / likelihoods.values.reduce(:+)
        end

        private

        # Factors that round to zero at five decimals are replaced by the
        # model threshold so they do not zero out the whole product.
        def apply_threshold(factor)
          factor.round(5).zero? ? threshold : factor
        end

        # Likelihood of each feature for +label+; features that match neither
        # a categorical nor a numerical table entry are left out.
        def feature_likelihoods(features, label)
          features.each_with_object({}) do |(feature_name, feature_value), found|
            likelihood = calc_category(feature_name, feature_value, label)
            likelihood ||= calc_numerical(feature_name, feature_value, label)
            found[feature_name] = likelihood if likelihood
          end
        end

        # Relative frequency of +feature_value+ within +label+, or nil when the
        # feature/value pair is not in the categorical table.
        def calc_category(feature_name, feature_value, label)
          categories = category_features[feature_name]
          return unless categories && categories[feature_value]

          value_count = categories[feature_value][label].to_f
          overall_count = categories.sum { |_, counts| counts[label].to_f }
          value_count / overall_count
        end

        # Gaussian density of +feature_value+ under the mean/variance stored
        # for +label+, or nil when no such entry exists.
        def calc_numerical(feature_name, feature_value, label)
          stats = numerical_features[feature_name] && numerical_features[feature_name][label]
          return unless stats

          variance = stats[:variance].to_f
          mean = stats[:mean].to_f
          feature_value = feature_value.to_f
          Math.exp(-(feature_value - mean)**2 / (2 * variance)) / Math.sqrt(2 * Math::PI * variance)
        end
      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Scoruby
  module Models
    module NaiveBayes
      # Reads the parameters of a naive Bayes model out of an XML document and
      # exposes them through readers.
      #
      # * threshold          - Float taken from the NaiveBayesModel element
      # * labels             - label value => { count: Float }
      # * category_features  - field name (Symbol) => { category => { label => count } },
      #                        or nil for fields without PairCounts children
      # * numerical_features - field name (Symbol) => { label => { mean:, variance: } },
      #                        or nil for fields whose first child is not TargetValueStats
      #
      # Counts, means and variances inside the feature tables are stored
      # exactly as returned by the node's #attr (not converted to numbers).
      class ModelData
        attr_reader :threshold, :labels, :numerical_features, :category_features

        # @param xml document object responding to #xpath
        #   (presumably a Nokogiri document — confirm against the caller)
        def initialize(xml)
          @xml = xml
          fetch_threshold
          fetch_features_data
          fetch_label_counts
        end

        private

        def fetch_threshold
          @threshold = @xml.xpath('//NaiveBayesModel').attr('threshold').value.to_f
        end

        # Every BayesInput gets an entry in BOTH tables; the table that does
        # not apply to the field holds nil for it.
        def fetch_features_data
          @category_features = {}
          @numerical_features = {}
          @xml.xpath('//BayesInput').each do |feature|
            field = feature.attr('fieldName').to_sym
            @category_features[field] = fetch_category_feature(feature)
            @numerical_features[field] = fetch_numerical_feature(feature)
          end
        end

        def fetch_label_counts
          @labels = {}
          @xml.xpath('//BayesOutput//TargetValueCount').each do |node|
            @labels[node.attr('value')] = { count: node.attr('count').to_f }
          end
        end

        # @return [Hash, nil] label (whitespace-stripped) => { mean:, variance: }
        def fetch_numerical_feature(feature)
          return unless feature.child.name == 'TargetValueStats'

          stats = {}
          feature.child.children.each do |stat|
            distribution = stat.child
            stats[stat.attr('value').strip] = {
              mean: distribution.attr('mean'),
              variance: distribution.attr('variance')
            }
          end
          stats
        end

        # @return [Hash, nil] category value => { label => count }
        def fetch_category_feature(feature)
          return unless feature.children.any? { |node| node.name == 'PairCounts' }

          categories = {}
          feature.children.each do |category|
            categories[category.attr('value')] = fetch_category(category)
          end
          categories
        end

        # Collects label => count from the grandchildren of one category node.
        def fetch_category(category)
          counts = {}
          category.child.children.each do |label|
            counts[label.attr('value')] = label.attr('count')
          end
          counts
        end
      end
    end
  end
end
|
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'scoruby/models/decision_tree'
|
2
2
|
require 'scoruby/models/gbm'
|
3
3
|
require 'scoruby/models/random_forest'
|
4
|
+
require 'scoruby/models/naive_bayes/model'
|
4
5
|
|
5
6
|
module Scoruby
|
6
7
|
class ModelsFactory
|
@@ -12,10 +13,15 @@ module Scoruby
|
|
12
13
|
return Models::RandomForest.new(xml) if random_forest?(xml)
|
13
14
|
return Models::Gbm.new(xml) if gbm?(xml)
|
14
15
|
return Models::DecisionTree.new(xml.child) if decision_tree?(xml)
|
16
|
+
return Models::NaiveBayes::Model.new(xml) if naive_bayes?(xml)
|
15
17
|
|
16
18
|
raise MODEL_NOT_SUPPORTED_ERROR
|
17
19
|
end
|
18
20
|
|
21
|
+
def self.naive_bayes?(xml)
|
22
|
+
!xml.xpath('PMML/NaiveBayesModel').empty?
|
23
|
+
end
|
24
|
+
|
19
25
|
def self.decision_tree?(xml)
|
20
26
|
!xml.xpath('PMML/TreeModel').empty?
|
21
27
|
end
|
data/lib/scoruby/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scoruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Asaf Schers
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-09-
|
11
|
+
date: 2017-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -131,7 +131,8 @@ files:
|
|
131
131
|
- lib/scoruby/features.rb
|
132
132
|
- lib/scoruby/models/decision_tree.rb
|
133
133
|
- lib/scoruby/models/gbm.rb
|
134
|
-
- lib/scoruby/models/naive_bayes.rb
|
134
|
+
- lib/scoruby/models/naive_bayes/model.rb
|
135
|
+
- lib/scoruby/models/naive_bayes/model_data.rb
|
135
136
|
- lib/scoruby/models/random_forest.rb
|
136
137
|
- lib/scoruby/models_factory.rb
|
137
138
|
- lib/scoruby/node.rb
|
@@ -1,92 +0,0 @@
|
|
1
|
-
module Scoruby
  module Models
    # Naive Bayes scorer that parses its parameters from an XML document and
    # keeps categorical and numerical features together in one table (#data).
    class NaiveBayes
      # @return [Hash] field name (Symbol) => categorical table
      #   ({ category => { label => count } }) or numerical table
      #   ({ label => { mean:, variance: } })
      attr_reader :data

      # @param xml document object responding to #xpath
      #   (presumably a Nokogiri document — confirm against the caller)
      def initialize(xml)
        @threshold = xml.xpath('//NaiveBayesModel').attr('threshold').value.to_f
        @data = {}
        xml.xpath('//BayesInput').each do |feature|
          @data[feature.attr('fieldName').to_sym] = fetch_feature(feature)
        end

        @labels = {}
        xml.xpath('//BayesOutput//TargetValueCount').each do |node|
          @labels[node.attr('value')] = { count: node.attr('count').to_f }
        end
      end

      # Computes the unnormalised likelihood of every label for +features+.
      #
      # Factors are collected in a fresh hash per call, so @labels is never
      # modified and earlier calls cannot affect later ones. Feature names
      # that the model does not know are ignored.
      #
      # @param features [Hash] feature name => observed value
      # @return [Hash] label => product of the label's stored factors and the
      #   likelihood of each recognised feature
      def lvalues(features)
        @labels.each_with_object({}) do |(label, label_data), result|
          factors = label_data.merge(feature_likelihoods(features, label))
          # Factors rounding to zero at five decimals are replaced by the threshold.
          smoothed = factors.values.map { |factor| factor.round(5).zero? ? @threshold : factor }
          result[label] = smoothed.reduce(:*)
        end
      end

      # Normalised share of +label+ among all label likelihoods.
      #
      # @param features [Hash] feature name => observed value
      # @param label the label to score, as parsed from TargetValueCount
      # @return [Float]
      def score(features, label)
        likelihoods = lvalues(features)
        likelihoods[label] / likelihoods.values.reduce(:+)
      end

      private

      # Likelihood of each recognised feature for +label+.
      def feature_likelihoods(features, label)
        features.each_with_object({}) do |(feature_name, feature_value), found|
          feature_data = @data[feature_name]
          next unless feature_data # feature not part of the model

          if feature_data[feature_value]
            value_count = feature_data[feature_value][label].to_f
            overall_count = feature_data.sum { |_, counts| counts[label].to_f }
            found[feature_name] = value_count / overall_count
          elsif feature_data[label]
            found[feature_name] = calc_numerical(feature_data[label], feature_value)
          end
        end
      end

      # Gaussian density of +feature_value+ for the given mean/variance entry.
      def calc_numerical(label_data, feature_value)
        variance = label_data[:variance].to_f
        mean = label_data[:mean].to_f
        feature_value = feature_value.to_f

        Math.exp(-(feature_value - mean)**2 / (2 * variance)) / Math.sqrt(2 * Math::PI * variance)
      end

      # Chooses the numerical parser when the first child is TargetValueStats,
      # the categorical parser otherwise.
      def fetch_feature(feature)
        return fetch_numerical_feature(feature) if feature.child.name == 'TargetValueStats'

        fetch_category_feature(feature)
      end

      # @return [Hash] label (whitespace-stripped) => { mean:, variance: }
      def fetch_numerical_feature(feature)
        stats = {}
        feature.child.children.each do |stat|
          distribution = stat.child
          stats[stat.attr('value').strip] = {
            mean: distribution.attr('mean'),
            variance: distribution.attr('variance')
          }
        end
        stats
      end

      # @return [Hash] category value => { label => count }
      def fetch_category_feature(feature)
        categories = {}
        feature.children.each do |category|
          categories[category.attr('value')] = fetch_category(category)
        end
        categories
      end

      # Collects label => count from the grandchildren of one category node.
      def fetch_category(category)
        counts = {}
        category.child.children.each do |label|
          counts[label.attr('value')] = label.attr('count')
        end
        counts
      end
    end
  end
end
|