scoruby 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +112 -4
- data/lib/random_forest/decision_tree.rb +9 -3
- data/lib/random_forest/features.rb +16 -0
- data/lib/random_forest/gbm.rb +29 -0
- data/lib/random_forest/node.rb +11 -6
- data/lib/random_forest/random_forest.rb +2 -1
- data/lib/random_forest/simple_predicate.rb +0 -7
- data/lib/random_forest/simple_set_predicate.rb +0 -6
- data/lib/scoruby/version.rb +1 -1
- data/scoruby.gemspec +2 -1
- metadata +19 -8
- data/lib/gbm/gbm.rb +0 -23
- data/lib/gbm/gbm_decision_tree.rb +0 -37
- data/lib/gbm/gbm_node.rb +0 -23
- data/lib/gbm/gbm_predicate.rb +0 -18
- data/lib/random_forest/predicate.rb +0 -32
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f1a4d1fcce322b50113aa532d34ea7397e17dfa7
|
4
|
+
data.tar.gz: e39e4e849ac6c98c14625372e1e17219b7ae2999
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bdd1e14f38a6ab1675c54e7b795bc56f7bab845f6b05f3f8725c1b439737e23471dec7a3f94b60d982f8dbdef349a892d4c045c2f48ff45e0b0c06057599e86e
|
7
|
+
data.tar.gz: ec3ae16a4b3fd4b6299f89c925718ad9583fca0d7d27dc45661edc5ea8597a183869a29552aeaa74ddb01af92f421144a89cf1adedf93527e0d0811855c3c0f0
|
data/README.md
CHANGED
@@ -1,10 +1,17 @@
|
|
1
1
|
<a href="https://codeclimate.com/github/asafschers/scoruby"><img src="https://codeclimate.com/github/asafschers/scoruby/badges/gpa.svg" /></a>
|
2
|
-
[](https://coveralls.io/github/asafschers/scoruby?branch=master)
|
3
|
+
[](https://badge.fury.io/rb/scoruby)
|
3
4
|
[](https://travis-ci.org/asafschers/scoruby)
|
4
5
|
|
5
6
|
# Scoruby
|
6
7
|
|
7
|
-
|
8
|
+
Ruby scoring API for Predictive Model Markup Language (PMML).
|
9
|
+
|
10
|
+
Currently supports random forest and gradient boosted models.
|
11
|
+
|
12
|
+
I will be happy to implement new models on demand, or to assist with any other issue.
|
13
|
+
|
14
|
+
Contact me here or at aschers@gmail.com.
|
8
15
|
|
9
16
|
## Installation
|
10
17
|
|
@@ -23,14 +30,115 @@ Or install it yourself as:
|
|
23
30
|
$ gem install scoruby
|
24
31
|
|
25
32
|
## Usage
|
33
|
+
### Random Forest
|
34
|
+
#### Generate PMML - R
|
35
|
+
|
36
|
+
```R
|
37
|
+
|
38
|
+
# Install and require randomForest, pmml packages
|
39
|
+
|
40
|
+
install.packages('randomForest')
|
41
|
+
install.packages('pmml')
|
42
|
+
library('randomForest')
|
43
|
+
library('pmml')
|
44
|
+
|
45
|
+
# Login to Kaggle and download titanic dataset
|
46
|
+
# https://www.kaggle.com/c/titanic/data
|
47
|
+
# Load CSV to data frame -
|
48
|
+
|
49
|
+
titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
|
50
|
+
titanic.train$Survived <- as.factor(titanic.train$Survived)
|
51
|
+
|
52
|
+
# Train RF model
|
53
|
+
|
54
|
+
titanic.rf <- randomForest(Survived ~ . - Name - Cabin - Ticket,
|
55
|
+
data = titanic.train,
|
56
|
+
na.action = na.roughfix)
|
57
|
+
|
58
|
+
# Generate pmml from model
|
59
|
+
|
60
|
+
pmml <- pmml(titanic.rf)
|
61
|
+
saveXML(pmml, 'titanic_rf.pmml')
|
62
|
+
|
63
|
+
```
|
64
|
+
|
65
|
+
#### Classify by PMML - Ruby
|
26
66
|
|
27
67
|
```ruby
|
28
|
-
|
29
|
-
|
68
|
+
|
69
|
+
random_forest = Scoruby.get_model 'titanic_rf.pmml'
|
70
|
+
features = {
|
71
|
+
Sex: 'male',
|
72
|
+
Parch: 0,
|
73
|
+
Age: 30,
|
74
|
+
Fare: 9.6875,
|
75
|
+
Pclass: 2,
|
76
|
+
SibSp: 0,
|
77
|
+
Embarked: 'Q'
|
78
|
+
}
|
79
|
+
|
30
80
|
random_forest.predict(features)
|
81
|
+
|
82
|
+
=> "0"
|
83
|
+
|
31
84
|
random_forest.decisions_count(features)
|
85
|
+
|
86
|
+
=> {"0"=>441, "1"=>59}
|
87
|
+
|
32
88
|
```
|
33
89
|
|
90
|
+
### Gradient Boosted model
|
91
|
+
|
92
|
+
#### Generate PMML - R
|
93
|
+
|
94
|
+
```R
|
95
|
+
|
96
|
+
# Install and require gbm, r2pmml
|
97
|
+
|
98
|
+
library("devtools")
|
99
|
+
install_github(repo = "jpmml/r2pmml")
|
100
|
+
|
101
|
+
library("r2pmml")
|
102
|
+
library("gbm")
|
103
|
+
|
104
|
+
# Login to Kaggle and download titanic dataset
|
105
|
+
# https://www.kaggle.com/c/titanic/data
|
106
|
+
# Load CSV to data frame -
|
107
|
+
|
108
|
+
titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
|
109
|
+
titanic.train$Survived <- as.factor(titanic.train$Survived)
|
110
|
+
|
111
|
+
# Train GBM model
|
112
|
+
|
113
|
+
titanic.gbm <- gbm(Survived ~ . - PassengerId - Name - Cabin - Ticket, data = titanic.train)
|
114
|
+
|
115
|
+
# Generate pmml from model
|
116
|
+
|
117
|
+
pmml <- r2pmml(titanic.gbm, 'titanic_gbm.pmml')
|
118
|
+
|
119
|
+
```
|
120
|
+
|
121
|
+
#### Classify by PMML - Ruby
|
122
|
+
|
123
|
+
```ruby
|
124
|
+
|
125
|
+
gbm = Scoruby.get_model 'gbm.pmml'
|
126
|
+
|
127
|
+
features = {
|
128
|
+
Sex: 'male',
|
129
|
+
Parch: 0,
|
130
|
+
Age: 30,
|
131
|
+
Fare: 9.6875,
|
132
|
+
Pclass: 2,
|
133
|
+
SibSp: 0,
|
134
|
+
Embarked: 'Q'
|
135
|
+
}
|
136
|
+
|
137
|
+
gbm.score(features)
|
138
|
+
|
139
|
+
=> 0.3652639329522468
|
140
|
+
|
141
|
+
```
|
34
142
|
|
35
143
|
## Development
|
36
144
|
|
@@ -23,14 +23,20 @@ class DecisionTree
|
|
23
23
|
private
|
24
24
|
|
25
25
|
def step(curr, features)
|
26
|
-
curr = curr
|
27
|
-
curr = curr
|
26
|
+
curr = step_on_true(curr, features, 0)
|
27
|
+
curr = step_on_true(curr, features, 1)
|
28
|
+
curr = step_on_true(curr, features, 2)
|
29
|
+
curr
|
30
|
+
end
|
31
|
+
|
32
|
+
def step_on_true(curr, features, num)
|
33
|
+
return curr.children[num] if curr.children && curr.children[num] && curr.children[num].true?(features)
|
28
34
|
curr
|
29
35
|
end
|
30
36
|
|
31
37
|
def didnt_step?(curr, prev)
|
32
38
|
return false if (prev.pred != curr.pred)
|
33
|
-
Scoruby.logger.error "Null tree: #{@id}, bad feature: #{curr.
|
39
|
+
Scoruby.logger.error "Null tree: #{@id}, bad feature: #{curr.children[0].pred.field }"
|
34
40
|
true
|
35
41
|
end
|
36
42
|
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
class Features
|
2
|
+
|
3
|
+
attr_reader :formatted
|
4
|
+
|
5
|
+
def initialize(features)
|
6
|
+
@formatted = format_booleans(features)
|
7
|
+
end
|
8
|
+
|
9
|
+
def format_booleans(features)
|
10
|
+
features.map { |k, v|
|
11
|
+
features[k] = 'f' if v == false
|
12
|
+
features[k] = 't' if v == true
|
13
|
+
}
|
14
|
+
features
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'decision_tree'
|
2
|
+
require 'features'
|
3
|
+
|
4
|
+
class Gbm
|
5
|
+
GBM_FOREST_XPATH = '//Segmentation[@multipleModelMethod="sum"]/Segment'
|
6
|
+
CONST_XPATH = '//Target/@rescaleConstant'
|
7
|
+
|
8
|
+
def initialize(xml)
|
9
|
+
@decision_trees = xml.xpath(GBM_FOREST_XPATH).collect{ |xml_tree|
|
10
|
+
DecisionTree.new(xml_tree)
|
11
|
+
}
|
12
|
+
@const = Float(xml.xpath(CONST_XPATH).to_s)
|
13
|
+
end
|
14
|
+
|
15
|
+
def tree_count
|
16
|
+
@decision_trees.count
|
17
|
+
end
|
18
|
+
|
19
|
+
def score(features)
|
20
|
+
formatted_features = Features.new(features).formatted
|
21
|
+
x = @decision_trees.map { |dt|
|
22
|
+
score = dt.decide(formatted_features)
|
23
|
+
score.to_s.to_f
|
24
|
+
}.reduce(:+) + @const
|
25
|
+
Math.exp(x) / (1 + Math.exp(x))
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
|
+
|
data/lib/random_forest/node.rb
CHANGED
@@ -1,18 +1,23 @@
|
|
1
|
-
require '
|
1
|
+
require 'simple_predicate'
|
2
|
+
require 'simple_set_predicate'
|
2
3
|
|
3
4
|
class Node
|
4
5
|
|
5
|
-
attr_reader :decision, :
|
6
|
+
attr_reader :decision, :pred, :children
|
6
7
|
|
7
8
|
def initialize(xml)
|
8
9
|
children = xml.children
|
9
|
-
|
10
|
-
|
10
|
+
pred_xml = children[0]
|
11
|
+
@pred = SimplePredicate.new(pred_xml) if pred_xml.name == 'SimplePredicate'
|
12
|
+
@pred = SimpleSetPredicate.new(pred_xml) if pred_xml.name == 'SimpleSetPredicate'
|
13
|
+
@children = []
|
11
14
|
@decision = xml.attribute('score').to_s
|
12
15
|
|
13
16
|
return if children.count == 1
|
14
|
-
|
15
|
-
@
|
17
|
+
|
18
|
+
@children << Node.new(children[1]) if children[1]
|
19
|
+
@children << Node.new(children[2]) if children[2]
|
20
|
+
@children << Node.new(children[3]) if children[3]
|
16
21
|
end
|
17
22
|
|
18
23
|
def true?(features)
|
@@ -11,8 +11,9 @@ class RandomForest
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def decisions_count(features)
|
14
|
+
formatted_features = Features.new(features).formatted
|
14
15
|
decisions = @decision_trees.collect { |decision_tree|
|
15
|
-
decision_tree.decide(
|
16
|
+
decision_tree.decide(formatted_features)
|
16
17
|
}
|
17
18
|
decisions.inject(Hash.new(0)) { |h, e| h[e] += 1 ; h }
|
18
19
|
end
|
@@ -20,9 +20,7 @@ class SimplePredicate
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def true?(features)
|
23
|
-
format_boolean(features)
|
24
23
|
return num_true?(features) if MATH_OPS.include?(@operator)
|
25
|
-
|
26
24
|
return features[@field] == @value if @operator == EQUAL
|
27
25
|
features[field].nil? || !features.has_key?(field) if @operator == IS_MISSING
|
28
26
|
end
|
@@ -36,9 +34,4 @@ class SimplePredicate
|
|
36
34
|
return curr_value <= value if @operator == LESS_OR_EQUAL
|
37
35
|
curr_value >= value if @operator == GREATER_OR_EQUAL
|
38
36
|
end
|
39
|
-
|
40
|
-
def format_boolean(features)
|
41
|
-
features[@field] = 'f' if features[@field] == false
|
42
|
-
features[@field] = 't' if features[@field] == true
|
43
|
-
end
|
44
37
|
end
|
@@ -12,7 +12,6 @@ class SimpleSetPredicate
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def true?(features)
|
15
|
-
format_boolean(features)
|
16
15
|
@array.include? features[@field] if @operator == IS_IN
|
17
16
|
end
|
18
17
|
|
@@ -23,9 +22,4 @@ class SimpleSetPredicate
|
|
23
22
|
reject(&:empty?).
|
24
23
|
map { |w| w.tr('"','')}
|
25
24
|
end
|
26
|
-
|
27
|
-
def format_boolean(features)
|
28
|
-
features[@field] = 'f' if features[@field] == false
|
29
|
-
features[@field] = 't' if features[@field] == true
|
30
|
-
end
|
31
25
|
end
|
data/lib/scoruby/version.rb
CHANGED
data/scoruby.gemspec
CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
|
|
9
9
|
spec.authors = ["Asaf Schers"]
|
10
10
|
spec.email = ["schers@riskified.com"]
|
11
11
|
|
12
|
-
spec.summary = %q{
|
12
|
+
spec.summary = %q{Ruby Scoring API for PMML.}
|
13
13
|
spec.homepage = 'https://github.com/asafschers/scoruby'
|
14
14
|
spec.license = "MIT"
|
15
15
|
|
@@ -22,6 +22,7 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.add_development_dependency "rake", "~> 12.0"
|
23
23
|
spec.add_development_dependency "rspec", "~> 3.5"
|
24
24
|
spec.add_development_dependency "pry", "~> 0.10"
|
25
|
+
spec.add_development_dependency "coveralls"
|
25
26
|
spec.add_development_dependency "ruby-prof"
|
26
27
|
spec.add_dependency "nokogiri", "~> 1.7"
|
27
28
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scoruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Asaf Schers
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-07-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.10'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: coveralls
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: ruby-prof
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|
@@ -110,13 +124,10 @@ files:
|
|
110
124
|
- Rakefile
|
111
125
|
- bin/console
|
112
126
|
- bin/setup
|
113
|
-
- lib/gbm/gbm.rb
|
114
|
-
- lib/gbm/gbm_decision_tree.rb
|
115
|
-
- lib/gbm/gbm_node.rb
|
116
|
-
- lib/gbm/gbm_predicate.rb
|
117
127
|
- lib/random_forest/decision_tree.rb
|
128
|
+
- lib/random_forest/features.rb
|
129
|
+
- lib/random_forest/gbm.rb
|
118
130
|
- lib/random_forest/node.rb
|
119
|
-
- lib/random_forest/predicate.rb
|
120
131
|
- lib/random_forest/random_forest.rb
|
121
132
|
- lib/random_forest/simple_predicate.rb
|
122
133
|
- lib/random_forest/simple_set_predicate.rb
|
@@ -148,6 +159,6 @@ rubyforge_project:
|
|
148
159
|
rubygems_version: 2.2.2
|
149
160
|
signing_key:
|
150
161
|
specification_version: 4
|
151
|
-
summary:
|
162
|
+
summary: Ruby Scoring API for PMML.
|
152
163
|
test_files: []
|
153
164
|
has_rdoc:
|
data/lib/gbm/gbm.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'gbm_decision_tree'
|
2
|
-
|
3
|
-
class Gbm
|
4
|
-
GBM_FOREST_XPATH = '//Segmentation[@multipleModelMethod="sum"]/Segment'
|
5
|
-
CONST_XPATH = '//Constant[@dataType="double"]'
|
6
|
-
|
7
|
-
def initialize(xml)
|
8
|
-
@decision_trees = xml.xpath(GBM_FOREST_XPATH).collect{ |xml_tree|
|
9
|
-
GbmDecisionTree.new(xml_tree)
|
10
|
-
}
|
11
|
-
@const = Float(xml.xpath(CONST_XPATH).children[0].content)
|
12
|
-
end
|
13
|
-
|
14
|
-
def tree_count
|
15
|
-
@decision_trees.count
|
16
|
-
end
|
17
|
-
|
18
|
-
def score(features)
|
19
|
-
x = @decision_trees.map { |dt| dt.decide(features) }.reduce(:+) + @const
|
20
|
-
Math.exp(x) / (1 + Math.exp(x))
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
@@ -1,37 +0,0 @@
|
|
1
|
-
require 'gbm_node'
|
2
|
-
|
3
|
-
class GbmDecisionTree
|
4
|
-
attr_reader :root
|
5
|
-
|
6
|
-
def initialize(tree_xml)
|
7
|
-
@id = tree_xml.attribute('id')
|
8
|
-
@root = GbmNode.new(tree_xml.xpath('TreeModel/Node'))
|
9
|
-
end
|
10
|
-
|
11
|
-
def decide(features)
|
12
|
-
curr = @root
|
13
|
-
while curr.score.nil?
|
14
|
-
prev = curr
|
15
|
-
curr = step(curr, features)
|
16
|
-
return if didnt_step?(curr, prev)
|
17
|
-
end
|
18
|
-
|
19
|
-
curr.score
|
20
|
-
end
|
21
|
-
|
22
|
-
private
|
23
|
-
|
24
|
-
def step(curr, features)
|
25
|
-
curr = curr.left if curr.left && curr.left.true?(features)
|
26
|
-
curr = curr.right if curr.right && curr.right.true?(features)
|
27
|
-
curr = curr.missing if curr.missing && curr.missing.true?(features)
|
28
|
-
curr
|
29
|
-
end
|
30
|
-
|
31
|
-
def didnt_step?(curr, prev)
|
32
|
-
return false if (prev.pred != curr.pred)
|
33
|
-
Scoruby.logger.error "Null tree: #{@id}, bad feature: #{curr.left.pred.field }"
|
34
|
-
true
|
35
|
-
end
|
36
|
-
|
37
|
-
end
|
data/lib/gbm/gbm_node.rb
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
require 'gbm_predicate'
|
2
|
-
|
3
|
-
class GbmNode
|
4
|
-
|
5
|
-
attr_reader :score, :missing, :left, :right, :pred
|
6
|
-
|
7
|
-
def initialize(xml)
|
8
|
-
children = xml.children
|
9
|
-
@pred = GbmPredicate.new(children[0])
|
10
|
-
|
11
|
-
@score = xml.attribute('score').to_s.to_f unless xml.attribute('score').to_s.empty?
|
12
|
-
|
13
|
-
return if children.count == 1
|
14
|
-
@missing = GbmNode.new(children[1]) if children[1]
|
15
|
-
@left = GbmNode.new(children[2]) if children[2]
|
16
|
-
@right = GbmNode.new(children[3]) if children[3]
|
17
|
-
end
|
18
|
-
|
19
|
-
def true?(features)
|
20
|
-
@pred.nil? || @pred.true?(features)
|
21
|
-
end
|
22
|
-
|
23
|
-
end
|
data/lib/gbm/gbm_predicate.rb
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
require 'simple_predicate'
|
2
|
-
require 'simple_set_predicate'
|
3
|
-
|
4
|
-
class GbmPredicate
|
5
|
-
|
6
|
-
def initialize(pred_xml)
|
7
|
-
@pred = SimplePredicate.new(pred_xml) if pred_xml.name == 'SimplePredicate'
|
8
|
-
@pred = SimpleSetPredicate.new(pred_xml) if pred_xml.name == 'SimpleSetPredicate'
|
9
|
-
end
|
10
|
-
|
11
|
-
def field
|
12
|
-
@pred.field
|
13
|
-
end
|
14
|
-
|
15
|
-
def true?(features)
|
16
|
-
@pred.true?(features)
|
17
|
-
end
|
18
|
-
end
|
@@ -1,32 +0,0 @@
|
|
1
|
-
require 'simple_predicate'
|
2
|
-
require 'simple_set_predicate'
|
3
|
-
|
4
|
-
class Predicate
|
5
|
-
|
6
|
-
def initialize(pred_xml)
|
7
|
-
@pred = SimplePredicate.new(pred_xml) if pred_xml.name == 'SimplePredicate'
|
8
|
-
@pred = SimpleSetPredicate.new(pred_xml) if pred_xml.name == 'SimpleSetPredicate'
|
9
|
-
end
|
10
|
-
|
11
|
-
def field
|
12
|
-
@pred.field
|
13
|
-
end
|
14
|
-
|
15
|
-
def true?(features)
|
16
|
-
return if missing_feature?(features)
|
17
|
-
return if nil_feature?(features)
|
18
|
-
@pred.true?(features)
|
19
|
-
end
|
20
|
-
|
21
|
-
def missing_feature?(features)
|
22
|
-
return false if features.has_key? field
|
23
|
-
Scoruby.logger.error "Missing feature #{field}"
|
24
|
-
true
|
25
|
-
end
|
26
|
-
|
27
|
-
def nil_feature?(features)
|
28
|
-
return false unless features[field].nil?
|
29
|
-
Scoruby.logger.error "Feature #{field} value is nil"
|
30
|
-
true
|
31
|
-
end
|
32
|
-
end
|