scoruby 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -2
- data/.travis.yml +1 -1
- data/README.md +3 -61
- data/lib/scoruby/models/naive_bayes.rb +92 -0
- data/lib/scoruby/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6d01f41ef6e3e3485acb65587892402866004bd2
|
4
|
+
data.tar.gz: 7d898f2a4c86915e19952a3641286dd252d7c3fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de6e7acbbcf5acd97f2b980253be94c7eb2fb7c31fb76c9c9d1fe6ca09447751974734370826d1477db639685a6ab37108ced5d85e4fc9b397285e1b61468a4e
|
7
|
+
data.tar.gz: 92e5728b151d7c3daa24e5fccebe6b7e8567c92af4fb03cdbc5b5ef20f5a75b53c88b5ef79e6d4db20df157995f45b6222677a0e1634702e3769240b61f025d4
|
data/.gitignore
CHANGED
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -7,7 +7,7 @@
|
|
7
7
|
|
8
8
|
Ruby scoring API for Predictive Model Markup Language (PMML).
|
9
9
|
|
10
|
-
Currently supports
|
10
|
+
Currently supports Decision Tree, Random Forest and Gradient Boosted Models.
|
11
11
|
|
12
12
|
Will be happy to implement new models by demand, or assist with any other issue.
|
13
13
|
|
@@ -31,37 +31,8 @@ Or install it yourself as:
|
|
31
31
|
|
32
32
|
## Usage
|
33
33
|
### Random Forest
|
34
|
-
#### Generate PMML - R
|
35
|
-
|
36
|
-
```R
|
37
|
-
|
38
|
-
# Install and require randomForest, pmml packages
|
39
|
-
|
40
|
-
install.packages('randomForest')
|
41
|
-
install.packages('pmml')
|
42
|
-
library('randomForest')
|
43
|
-
library('pmml')
|
44
|
-
|
45
|
-
# Login to Kaggle and download titanic dataset
|
46
|
-
# https://www.kaggle.com/c/titanic/data
|
47
|
-
# Load CSV to data frame -
|
48
|
-
|
49
|
-
titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
|
50
|
-
titanic.train$Survived <- as.factor(titanic.train$Survived)
|
51
|
-
|
52
|
-
# Train RF model
|
53
|
-
|
54
|
-
titanic.rf <- randomForest(Survived ~ . - Name - Cabin - Ticket,
|
55
|
-
data = titanic.train,
|
56
|
-
na.action = na.roughfix)
|
57
|
-
|
58
|
-
# Generate pmml from model
|
59
|
-
|
60
|
-
pmml <- pmml(titanic.rf)
|
61
|
-
saveXML(pmml, 'titanic_rf.pmml')
|
62
|
-
|
63
|
-
```
|
64
34
|
|
35
|
+
[Generate PMML - R](https://github.com/asafschers/scoruby/wiki/Random-Forest)
|
65
36
|
#### Classify by PMML - Ruby
|
66
37
|
|
67
38
|
```ruby
|
@@ -89,34 +60,7 @@ random_forest.decisions_count(features)
|
|
89
60
|
|
90
61
|
### Gradient Boosted model
|
91
62
|
|
92
|
-
|
93
|
-
|
94
|
-
```R
|
95
|
-
|
96
|
-
# Install and require gbm, r2pmml
|
97
|
-
|
98
|
-
library("devtools")
|
99
|
-
install_github(repo = "jpmml/r2pmml")
|
100
|
-
|
101
|
-
library("r2pmml")
|
102
|
-
library("gbm")
|
103
|
-
|
104
|
-
# Login to Kaggle and download titanic dataset
|
105
|
-
# https://www.kaggle.com/c/titanic/data
|
106
|
-
# Load CSV to data frame -
|
107
|
-
|
108
|
-
titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
|
109
|
-
titanic.train$Survived <- as.factor(titanic.train$Survived)
|
110
|
-
|
111
|
-
# Train GBM model
|
112
|
-
|
113
|
-
titanic.gbm <- gbm(Survived ~ . - PassengerId - Name - Cabin - Ticket, data = titanic.train)
|
114
|
-
|
115
|
-
# Generate pmml from model
|
116
|
-
|
117
|
-
pmml <- r2pmml(titanic.gbm, 'titanic_gbm.pmml')
|
118
|
-
|
119
|
-
```
|
63
|
+
[Generate PMML - R](https://github.com/asafschers/scoruby/wiki/Gradient-Boosted-Model)
|
120
64
|
|
121
65
|
#### Classify by PMML - Ruby
|
122
66
|
|
@@ -142,8 +86,6 @@ gbm.score(features)
|
|
142
86
|
|
143
87
|
### Decision Tree
|
144
88
|
|
145
|
-
#### Classify by PMML - Ruby
|
146
|
-
|
147
89
|
```ruby
|
148
90
|
decision_tree = Scoruby.get_model 'decision_tree.pmml'
|
149
91
|
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Scoruby
  module Models
    # Naive Bayes classifier scored from a parsed PMML document.
    #
    # The constructor reads three things from the document:
    # * the `threshold` attribute of `NaiveBayesModel`, substituted for any
    #   factor that rounds to zero so one empty count cannot zero a product;
    # * one entry per `BayesInput`, keyed by the symbolized `fieldName`
    #   (exposed through {#data});
    # * one prior count per `BayesOutput//TargetValueCount`, keyed by label.
    #
    # Scoring never mutates the loaded model, so repeated calls with different
    # feature sets are independent of each other.
    class NaiveBayes
      attr_reader :data

      # @param xml [#xpath] parsed PMML document. The code calls `xpath`,
      #   `attr`, `child` and `children` on it and its nodes
      #   (presumably a Nokogiri document - confirm against the caller).
      def initialize(xml)
        @threshold = xml.xpath('//NaiveBayesModel').attr('threshold').value.to_f

        @data = {}
        xml.xpath('//BayesInput').each do |feature|
          @data[feature.attr('fieldName').to_sym] = fetch_feature(feature)
        end

        @labels = {}
        xml.xpath('//BayesOutput//TargetValueCount').each do |target|
          @labels[target.attr('value')] = { count: target.attr('count').to_f }
        end
      end

      # Computes the unnormalized likelihood of every label for +features+.
      #
      # For each label the result is the label's prior count multiplied by one
      # factor per recognized feature. Factors that round to zero at five
      # decimals are replaced by the model threshold before multiplying.
      # Feature names absent from the model, and feature values that match
      # neither a category nor a per-label statistics entry, contribute no
      # factor.
      #
      # @param features [Hash{Symbol => Object}] feature name => observed value
      # @return [Hash{String => Float}] label => unnormalized likelihood
      def lvalues(features)
        @labels.each_with_object({}) do |(label, label_data), result|
          # Built per call: storing these in @labels would leak factors from
          # one call into the next.
          factors = [label_data[:count]]
          features.each do |feature_name, feature_value|
            factor = feature_likelihood(label, feature_name, feature_value)
            factors << factor if factor
          end

          result[label] = factors
                          .map { |value| value.round(5).zero? ? @threshold : value }
                          .reduce(:*)
        end
      end

      # Normalized score of +label+: its likelihood divided by the sum of the
      # likelihoods of all labels.
      #
      # @param features [Hash{Symbol => Object}] feature name => observed value
      # @param label [String] one of the labels loaded from the model
      # @return [Float]
      def score(features, label)
        likelihoods = lvalues(features)
        likelihoods[label] / likelihoods.values.reduce(:+)
      end

      private

      # Factor contributed by a single feature for +label+, or nil when the
      # model has nothing to say about it.
      def feature_likelihood(label, feature_name, feature_value)
        feature_data = @data[feature_name]
        return unless feature_data

        if feature_data[feature_value]
          # Categorical: share of this value among all values seen for label.
          value_count = feature_data[feature_value][label].to_f
          overall_count = feature_data.sum { |_, counts| counts[label].to_f }
          value_count / overall_count
        elsif feature_data[label]
          # Numerical: entries are keyed by label and hold mean/variance.
          calc_numerical(feature_data[label], feature_value)
        end
      end

      # Gaussian density of +feature_value+ under the mean/variance stored for
      # a label.
      def calc_numerical(label_data, feature_value)
        variance = label_data[:variance].to_f
        mean = label_data[:mean].to_f
        feature_value = feature_value.to_f

        exponent = -((feature_value - mean)**2) / (2 * variance)
        Math.exp(exponent) / Math.sqrt(2 * Math::PI * variance)
      end

      # Dispatches on the first child element of a BayesInput node.
      # NOTE(review): relies on the first child being an element rather than
      # whitespace text - confirm how the document is parsed.
      def fetch_feature(feature)
        return fetch_numerical_feature(feature) if feature.child.name == 'TargetValueStats'
        fetch_category_feature(feature)
      end

      # @return [Hash{String => Hash}] stripped label => { mean:, variance: }
      #   (attribute values are kept as read; converted with to_f when used)
      def fetch_numerical_feature(feature)
        feature.child.children.each_with_object({}) do |child, features_data|
          features_data[child.attr('value').strip] = {
            mean: child.child.attr('mean'),
            variance: child.child.attr('variance')
          }
        end
      end

      # @return [Hash] category value => { label => count }
      def fetch_category_feature(feature)
        feature.children.each_with_object({}) do |category, feature_data|
          feature_data[category.attr('value')] = fetch_category(category)
        end
      end

      # @return [Hash] label => count attribute, for one category node
      def fetch_category(category)
        category.child.children.each_with_object({}) do |label, category_data|
          category_data[label.attr('value')] = label.attr('count')
        end
      end
    end
  end
end
|
data/lib/scoruby/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scoruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Asaf Schers
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -131,6 +131,7 @@ files:
|
|
131
131
|
- lib/scoruby/features.rb
|
132
132
|
- lib/scoruby/models/decision_tree.rb
|
133
133
|
- lib/scoruby/models/gbm.rb
|
134
|
+
- lib/scoruby/models/naive_bayes.rb
|
134
135
|
- lib/scoruby/models/random_forest.rb
|
135
136
|
- lib/scoruby/models_factory.rb
|
136
137
|
- lib/scoruby/node.rb
|