scoruby 0.2.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -2
- data/.travis.yml +1 -1
- data/README.md +3 -61
- data/lib/scoruby/models/naive_bayes.rb +92 -0
- data/lib/scoruby/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d01f41ef6e3e3485acb65587892402866004bd2
|
4
|
+
data.tar.gz: 7d898f2a4c86915e19952a3641286dd252d7c3fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de6e7acbbcf5acd97f2b980253be94c7eb2fb7c31fb76c9c9d1fe6ca09447751974734370826d1477db639685a6ab37108ced5d85e4fc9b397285e1b61468a4e
|
7
|
+
data.tar.gz: 92e5728b151d7c3daa24e5fccebe6b7e8567c92af4fb03cdbc5b5ef20f5a75b53c88b5ef79e6d4db20df157995f45b6222677a0e1634702e3769240b61f025d4
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
|
8
8
|
Ruby scoring API for Predictive Model Markup Language (PMML).
|
9
9
|
|
10
|
-
Currently supports
|
10
|
+
Currently supports Decision Tree, Random Forest and Gradient Boosted Models.
|
11
11
|
|
12
12
|
Will be happy to implement new models by demand, or assist with any other issue.
|
13
13
|
|
@@ -31,37 +31,8 @@ Or install it yourself as:
|
|
31
31
|
|
32
32
|
## Usage
|
33
33
|
### Random Forest
|
34
|
-
#### Generate PMML - R
|
35
|
-
|
36
|
-
```R
|
37
|
-
|
38
|
-
# Install and require randomForest, pmml packages
|
39
|
-
|
40
|
-
install.packages('randomForest')
|
41
|
-
install.packages('pmml')
|
42
|
-
library('randomForest')
|
43
|
-
library('pmml')
|
44
|
-
|
45
|
-
# Login to Kaggle and download titanic dataset
|
46
|
-
# https://www.kaggle.com/c/titanic/data
|
47
|
-
# Load CSV to data frame -
|
48
|
-
|
49
|
-
titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
|
50
|
-
titanic.train$Survived <- as.factor(titanic.train$Survived)
|
51
|
-
|
52
|
-
# Train RF model
|
53
|
-
|
54
|
-
titanic.rf <- randomForest(Survived ~ . - Name - Cabin - Ticket,
|
55
|
-
data = titanic.train,
|
56
|
-
na.action = na.roughfix)
|
57
|
-
|
58
|
-
# Generate pmml from model
|
59
|
-
|
60
|
-
pmml <- pmml(titanic.rf)
|
61
|
-
saveXML(pmml, 'titanic_rf.pmml')
|
62
|
-
|
63
|
-
```
|
64
34
|
|
35
|
+
[Generate PMML - R](https://github.com/asafschers/scoruby/wiki/Random-Forest)
|
65
36
|
#### Classify by PMML - Ruby
|
66
37
|
|
67
38
|
```ruby
|
@@ -89,34 +60,7 @@ random_forest.decisions_count(features)
|
|
89
60
|
|
90
61
|
### Gradient Boosted model
|
91
62
|
|
92
|
-
|
93
|
-
|
94
|
-
```R
|
95
|
-
|
96
|
-
# Install and require gbm, r2pmml
|
97
|
-
|
98
|
-
library("devtools")
|
99
|
-
install_github(repo = "jpmml/r2pmml")
|
100
|
-
|
101
|
-
library("r2pmml")
|
102
|
-
library("gbm")
|
103
|
-
|
104
|
-
# Login to Kaggle and download titanic dataset
|
105
|
-
# https://www.kaggle.com/c/titanic/data
|
106
|
-
# Load CSV to data frame -
|
107
|
-
|
108
|
-
titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
|
109
|
-
titanic.train$Survived <- as.factor(titanic.train$Survived)
|
110
|
-
|
111
|
-
# Train GBM model
|
112
|
-
|
113
|
-
titanic.gbm <- gbm(Survived ~ . - PassengerId - Name - Cabin - Ticket, data = titanic.train)
|
114
|
-
|
115
|
-
# Generate pmml from model
|
116
|
-
|
117
|
-
pmml <- r2pmml(titanic.gbm, 'titanic_gbm.pmml')
|
118
|
-
|
119
|
-
```
|
63
|
+
[Generate PMML - R](https://github.com/asafschers/scoruby/wiki/Gradient-Boosted-Model)
|
120
64
|
|
121
65
|
#### Classify by PMML - Ruby
|
122
66
|
|
@@ -142,8 +86,6 @@ gbm.score(features)
|
|
142
86
|
|
143
87
|
### Decision Tree
|
144
88
|
|
145
|
-
#### Classify by PMML - Ruby
|
146
|
-
|
147
89
|
```ruby
|
148
90
|
decision_tree = Scoruby.get_model 'decision_tree.pmml'
|
149
91
|
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Scoruby
  module Models
    # Naive Bayes scorer built from a parsed PMML document.
    #
    # The constructor reads three things from the document:
    # * the +threshold+ attribute of the +NaiveBayesModel+ element,
    # * one entry per +BayesInput+ element (exposed through #data),
    # * the per-label counts found under +BayesOutput//TargetValueCount+.
    #
    # The +xml+ object only needs to respond to +xpath+, and its nodes to
    # +attr+, +child+, +children+ and +name+.
    # NOTE(review): +child+ / +children+ are used as-is, so this presumably
    # expects blank text nodes to have been stripped at parse time — confirm
    # against the code that loads the document.
    class NaiveBayes
      # Parsed model inputs, keyed by feature name (Symbol).
      # * categorical feature: { category => { label => count_string } }
      # * numerical feature:   { label => { mean: string, variance: string } }
      attr_reader :data

      # @param xml [#xpath] parsed PMML document
      def initialize(xml)
        @threshold = xml.xpath('//NaiveBayesModel').attr('threshold').value.to_f

        @data = {}
        # Remember each input's kind at parse time. Probing @data's keys later
        # is ambiguous when a feature value happens to equal a label name.
        @numerical = {}
        xml.xpath('//BayesInput').each do |feature|
          name = feature.attr('fieldName').to_sym
          @numerical[name] = numerical?(feature)
          @data[name] = fetch_feature(feature)
        end

        @label_counts = {}
        xml.xpath('//BayesOutput//TargetValueCount').each do |target|
          @label_counts[target.attr('value')] = target.attr('count').to_f
        end
      end

      # Computes, for every label, the product of the label count and the
      # likelihood of each supplied feature under that label.
      #
      # Any factor that rounds to zero at 5 decimal places is replaced by the
      # model threshold before multiplying. Features the model has no input
      # for, categorical values with no counts, and numerical inputs without
      # stats for a label contribute no factor.
      #
      # The result is built from scratch on every call; no state is kept
      # between calls.
      #
      # @param features [Hash{Symbol => Object}] feature name => observed value
      # @return [Hash{String => Float}] label => unnormalised likelihood
      def lvalues(features)
        @label_counts.each_with_object({}) do |(label, count), result|
          factors = [count]
          features.each do |feature_name, feature_value|
            factor = likelihood(feature_name, feature_value, label)
            factors << factor if factor
          end
          result[label] = factors.map { |f| f.round(5).zero? ? @threshold : f }.reduce(:*)
        end
      end

      # Normalised score of +label+: its likelihood divided by the sum of the
      # likelihoods of all labels.
      #
      # @param features [Hash{Symbol => Object}] feature name => observed value
      # @param label [String] one of the labels declared by the model
      # @return [Float]
      def score(features, label)
        likelihoods = lvalues(features)
        likelihoods[label] / likelihoods.values.reduce(:+)
      end

      private

      # Likelihood of one feature value under +label+, or nil when the model
      # has nothing to say about it (see #lvalues).
      def likelihood(feature_name, feature_value, label)
        feature_data = @data[feature_name]
        return unless feature_data

        if @numerical[feature_name]
          stats = feature_data[label]
          stats && calc_numerical(stats, feature_value)
        else
          counts = feature_data[feature_value]
          counts && categorical_likelihood(feature_data, counts, label)
        end
      end

      # Share of +label+ observations that fall in the given category.
      def categorical_likelihood(feature_data, counts, label)
        total = feature_data.sum { |_, by_label| by_label[label].to_f }
        # Avoid 0/0 (NaN); a zero factor is later replaced by the threshold.
        return 0.0 if total.zero?

        counts[label].to_f / total
      end

      # Gaussian density of +feature_value+ for the given mean and variance.
      def calc_numerical(label_data, feature_value)
        variance = label_data[:variance].to_f
        mean = label_data[:mean].to_f
        deviation = feature_value.to_f - mean

        Math.exp(-(deviation**2) / (2 * variance)) / Math.sqrt(2 * Math::PI * variance)
      end

      # True when the input's first child is a +TargetValueStats+ element.
      def numerical?(feature)
        feature.child.name == 'TargetValueStats'
      end

      def fetch_feature(feature)
        return fetch_numerical_feature(feature) if numerical?(feature)

        fetch_category_feature(feature)
      end

      # Builds { label => { mean:, variance: } } from the children of the
      # input's first child; label names are stripped of surrounding spaces.
      def fetch_numerical_feature(feature)
        feature.child.children.each_with_object({}) do |stat, stats|
          stats[stat.attr('value').strip] = {
            mean: stat.child.attr('mean'),
            variance: stat.child.attr('variance')
          }
        end
      end

      # Builds { category => { label => count } } from the input's children.
      def fetch_category_feature(feature)
        feature.children.each_with_object({}) do |category, categories|
          categories[category.attr('value')] = fetch_category(category)
        end
      end

      # Builds { label => count } from the children of the category's first
      # child. Counts are kept as read (strings) and converted when used.
      def fetch_category(category)
        category.child.children.each_with_object({}) do |target, counts|
          counts[target.attr('value')] = target.attr('count')
        end
      end
    end
  end
end
|
data/lib/scoruby/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scoruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Asaf Schers
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -131,6 +131,7 @@ files:
|
|
131
131
|
- lib/scoruby/features.rb
|
132
132
|
- lib/scoruby/models/decision_tree.rb
|
133
133
|
- lib/scoruby/models/gbm.rb
|
134
|
+
- lib/scoruby/models/naive_bayes.rb
|
134
135
|
- lib/scoruby/models/random_forest.rb
|
135
136
|
- lib/scoruby/models_factory.rb
|
136
137
|
- lib/scoruby/node.rb
|