scoruby 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 98eb545b1bb149f22fb5afbaf467d144af1dbb2c
4
- data.tar.gz: dbae0fa59acbf5bb10252bd1971cda46fafb8095
3
+ metadata.gz: f1a4d1fcce322b50113aa532d34ea7397e17dfa7
4
+ data.tar.gz: e39e4e849ac6c98c14625372e1e17219b7ae2999
5
5
  SHA512:
6
- metadata.gz: c572f65fdd04226519f6518aa56ec92d942b8882c700828f2c17447b89cff9286fea243a8f171b2b259b3a15dbce6985ea474ff14a802894fa068546db355ce8
7
- data.tar.gz: 9bd8096adc82636f5a10d6b93d6f0230a7a82b67a755da53153b32dc58c29887a06a31af86235bf2533792bdbccc8b9c77d9677bb3c099c8ab30793aa4696ce4
6
+ metadata.gz: bdd1e14f38a6ab1675c54e7b795bc56f7bab845f6b05f3f8725c1b439737e23471dec7a3f94b60d982f8dbdef349a892d4c045c2f48ff45e0b0c06057599e86e
7
+ data.tar.gz: ec3ae16a4b3fd4b6299f89c925718ad9583fca0d7d27dc45661edc5ea8597a183869a29552aeaa74ddb01af92f421144a89cf1adedf93527e0d0811855c3c0f0
data/README.md CHANGED
@@ -1,10 +1,17 @@
1
1
  <a href="https://codeclimate.com/github/asafschers/scoruby"><img src="https://codeclimate.com/github/asafschers/scoruby/badges/gpa.svg" /></a>
2
- [![Gem Version](https://badge.fury.io/rb/random_forester.svg)](https://badge.fury.io/rb/random_forester)
2
+ [![Coverage Status](https://coveralls.io/repos/github/asafschers/scoruby/badge.svg?branch=master)](https://coveralls.io/github/asafschers/scoruby?branch=master)
3
+ [![Gem Version](https://badge.fury.io/rb/scoruby.svg)](https://badge.fury.io/rb/scoruby)
3
4
  [![Build Status](https://travis-ci.org/asafschers/scoruby.svg?branch=master)](https://travis-ci.org/asafschers/scoruby)
4
5
 
5
6
  # Scoruby
6
7
 
7
- Reads Random Forest PMML files and creates Ruby Random Forest classifier model.
8
+ Ruby scoring API for Predictive Model Markup Language (PMML).
9
+
10
+ Currently supports random forest and gradient boosted models.
11
+
12
+ I will be happy to implement new models on demand, or to assist with any other issue.
13
+
14
+ Contact me here or at aschers@gmail.com.
8
15
 
9
16
  ## Installation
10
17
 
@@ -23,14 +30,115 @@ Or install it yourself as:
23
30
  $ gem install scoruby
24
31
 
25
32
  ## Usage
33
+ ### Random Forest
34
+ #### Generate PMML - R
35
+
36
+ ```R
37
+
38
+ # Install and require randomForest, pmml packages
39
+
40
+ install.packages('randomForest')
41
+ install.packages('pmml')
42
+ library('randomForest')
43
+ library('pmml')
44
+
45
+ # Login to Kaggle and download titanic dataset
46
+ # https://www.kaggle.com/c/titanic/data
47
+ # Load the CSV into a data frame
48
+
49
+ titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
50
+ titanic.train$Survived <- as.factor(titanic.train$Survived)
51
+
52
+ # Train RF model
53
+
54
+ titanic.rf <- randomForest(Survived ~ . - Name - Cabin - Ticket,
55
+ data = titanic.train,
56
+ na.action = na.roughfix)
57
+
58
+ # Generate pmml from model
59
+
60
+ pmml <- pmml(titanic.rf)
61
+ saveXML(pmml, 'titanic_rf.pmml')
62
+
63
+ ```
64
+
65
+ #### Classify by PMML - Ruby
26
66
 
27
67
  ```ruby
28
- random_forest = Scourby.get_model 'rf.pmml'
29
- features = {a: 1, b: true, c: "YES"}
68
+
69
+ random_forest = Scoruby.get_model 'titanic_rf.pmml'
70
+ features = {
71
+ Sex: 'male',
72
+ Parch: 0,
73
+ Age: 30,
74
+ Fare: 9.6875,
75
+ Pclass: 2,
76
+ SibSp: 0,
77
+ Embarked: 'Q'
78
+ }
79
+
30
80
  random_forest.predict(features)
81
+
82
+ => "0"
83
+
31
84
  random_forest.decisions_count(features)
85
+
86
+ => {"0"=>441, "1"=>59}
87
+
32
88
  ```
33
89
 
90
+ ### Gradient Boosted model
91
+
92
+ #### Generate PMML - R
93
+
94
+ ```R
95
+
96
+ # Install and require gbm, r2pmml
97
+
98
+ library("devtools")
99
+ install_github(repo = "jpmml/r2pmml")
100
+
101
+ library("r2pmml")
102
+ library("gbm")
103
+
104
+ # Login to Kaggle and download titanic dataset
105
+ # https://www.kaggle.com/c/titanic/data
106
+ # Load the CSV into a data frame
107
+
108
+ titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
109
+ titanic.train$Survived <- as.factor(titanic.train$Survived)
110
+
111
+ # Train GBM model
112
+
113
+ titanic.gbm <- gbm(Survived ~ . - PassengerId - Name - Cabin - Ticket, data = titanic.train)
114
+
115
+ # Generate pmml from model
116
+
117
+ pmml <- r2pmml(titanic.gbm, 'titanic_gbm.pmml')
118
+
119
+ ```
120
+
121
+ #### Classify by PMML - Ruby
122
+
123
+ ```ruby
124
+
125
+ gbm = Scoruby.get_model 'titanic_gbm.pmml'
126
+
127
+ features = {
128
+ Sex: 'male',
129
+ Parch: 0,
130
+ Age: 30,
131
+ Fare: 9.6875,
132
+ Pclass: 2,
133
+ SibSp: 0,
134
+ Embarked: 'Q'
135
+ }
136
+
137
+ gbm.score(features)
138
+
139
+ => 0.3652639329522468
140
+
141
+ ```
34
142
 
35
143
  ## Development
36
144
 
@@ -23,14 +23,20 @@ class DecisionTree
23
23
  private
24
24
 
25
25
  def step(curr, features)
26
- curr = curr.left if curr.left && curr.left.true?(features)
27
- curr = curr.right if curr.right && curr.right.true?(features)
26
+ curr = step_on_true(curr, features, 0)
27
+ curr = step_on_true(curr, features, 1)
28
+ curr = step_on_true(curr, features, 2)
29
+ curr
30
+ end
31
+
32
+ def step_on_true(curr, features, num)
33
+ return curr.children[num] if curr.children && curr.children[num] && curr.children[num].true?(features)
28
34
  curr
29
35
  end
30
36
 
31
37
  def didnt_step?(curr, prev)
32
38
  return false if (prev.pred != curr.pred)
33
- Scoruby.logger.error "Null tree: #{@id}, bad feature: #{curr.left.pred.field }"
39
+ Scoruby.logger.error "Null tree: #{@id}, bad feature: #{curr.children[0].pred.field }"
34
40
  true
35
41
  end
36
42
  end
@@ -0,0 +1,16 @@
1
+ class Features
2
+
3
+ attr_reader :formatted
4
+
5
+ def initialize(features)
6
+ @formatted = format_booleans(features)
7
+ end
8
+
9
+ def format_booleans(features)
10
+ features.map { |k, v|
11
+ features[k] = 'f' if v == false
12
+ features[k] = 't' if v == true
13
+ }
14
+ features
15
+ end
16
+ end
@@ -0,0 +1,29 @@
1
+ require 'decision_tree'
2
+ require 'features'
3
+
4
+ class Gbm
5
+ GBM_FOREST_XPATH = '//Segmentation[@multipleModelMethod="sum"]/Segment'
6
+ CONST_XPATH = '//Target/@rescaleConstant'
7
+
8
+ def initialize(xml)
9
+ @decision_trees = xml.xpath(GBM_FOREST_XPATH).collect{ |xml_tree|
10
+ DecisionTree.new(xml_tree)
11
+ }
12
+ @const = Float(xml.xpath(CONST_XPATH).to_s)
13
+ end
14
+
15
+ def tree_count
16
+ @decision_trees.count
17
+ end
18
+
19
+ def score(features)
20
+ formatted_features = Features.new(features).formatted
21
+ x = @decision_trees.map { |dt|
22
+ score = dt.decide(formatted_features)
23
+ score.to_s.to_f
24
+ }.reduce(:+) + @const
25
+ Math.exp(x) / (1 + Math.exp(x))
26
+ end
27
+
28
+ end
29
+
@@ -1,18 +1,23 @@
1
- require 'predicate'
1
+ require 'simple_predicate'
2
+ require 'simple_set_predicate'
2
3
 
3
4
  class Node
4
5
 
5
- attr_reader :decision, :left, :right, :pred
6
+ attr_reader :decision, :pred, :children
6
7
 
7
8
  def initialize(xml)
8
9
  children = xml.children
9
- @pred = Predicate.new(children[0])
10
-
10
+ pred_xml = children[0]
11
+ @pred = SimplePredicate.new(pred_xml) if pred_xml.name == 'SimplePredicate'
12
+ @pred = SimpleSetPredicate.new(pred_xml) if pred_xml.name == 'SimpleSetPredicate'
13
+ @children = []
11
14
  @decision = xml.attribute('score').to_s
12
15
 
13
16
  return if children.count == 1
14
- @left = Node.new(children[1]) if children[1]
15
- @right = Node.new(children[2]) if children[2]
17
+
18
+ @children << Node.new(children[1]) if children[1]
19
+ @children << Node.new(children[2]) if children[2]
20
+ @children << Node.new(children[3]) if children[3]
16
21
  end
17
22
 
18
23
  def true?(features)
@@ -11,8 +11,9 @@ class RandomForest
11
11
  end
12
12
 
13
13
  def decisions_count(features)
14
+ formatted_features = Features.new(features).formatted
14
15
  decisions = @decision_trees.collect { |decision_tree|
15
- decision_tree.decide(features)
16
+ decision_tree.decide(formatted_features)
16
17
  }
17
18
  decisions.inject(Hash.new(0)) { |h, e| h[e] += 1 ; h }
18
19
  end
@@ -20,9 +20,7 @@ class SimplePredicate
20
20
  end
21
21
 
22
22
  def true?(features)
23
- format_boolean(features)
24
23
  return num_true?(features) if MATH_OPS.include?(@operator)
25
-
26
24
  return features[@field] == @value if @operator == EQUAL
27
25
  features[field].nil? || !features.has_key?(field) if @operator == IS_MISSING
28
26
  end
@@ -36,9 +34,4 @@ class SimplePredicate
36
34
  return curr_value <= value if @operator == LESS_OR_EQUAL
37
35
  curr_value >= value if @operator == GREATER_OR_EQUAL
38
36
  end
39
-
40
- def format_boolean(features)
41
- features[@field] = 'f' if features[@field] == false
42
- features[@field] = 't' if features[@field] == true
43
- end
44
37
  end
@@ -12,7 +12,6 @@ class SimpleSetPredicate
12
12
  end
13
13
 
14
14
  def true?(features)
15
- format_boolean(features)
16
15
  @array.include? features[@field] if @operator == IS_IN
17
16
  end
18
17
 
@@ -23,9 +22,4 @@ class SimpleSetPredicate
23
22
  reject(&:empty?).
24
23
  map { |w| w.tr('"','')}
25
24
  end
26
-
27
- def format_boolean(features)
28
- features[@field] = 'f' if features[@field] == false
29
- features[@field] = 't' if features[@field] == true
30
- end
31
25
  end
@@ -1,3 +1,3 @@
1
1
  module Scoruby
2
- VERSION = '0.1.3'
2
+ VERSION = '0.2.0'
3
3
  end
data/scoruby.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Asaf Schers"]
10
10
  spec.email = ["schers@riskified.com"]
11
11
 
12
- spec.summary = %q{Creates a random forest object from a pmml file.}
12
+ spec.summary = %q{Ruby Scoring API for PMML.}
13
13
  spec.homepage = 'https://github.com/asafschers/scoruby'
14
14
  spec.license = "MIT"
15
15
 
@@ -22,6 +22,7 @@ Gem::Specification.new do |spec|
22
22
  spec.add_development_dependency "rake", "~> 12.0"
23
23
  spec.add_development_dependency "rspec", "~> 3.5"
24
24
  spec.add_development_dependency "pry", "~> 0.10"
25
+ spec.add_development_dependency "coveralls"
25
26
  spec.add_development_dependency "ruby-prof"
26
27
  spec.add_dependency "nokogiri", "~> 1.7"
27
28
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scoruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Schers
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-04-21 00:00:00.000000000 Z
11
+ date: 2017-07-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.10'
69
+ - !ruby/object:Gem::Dependency
70
+ name: coveralls
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: ruby-prof
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -110,13 +124,10 @@ files:
110
124
  - Rakefile
111
125
  - bin/console
112
126
  - bin/setup
113
- - lib/gbm/gbm.rb
114
- - lib/gbm/gbm_decision_tree.rb
115
- - lib/gbm/gbm_node.rb
116
- - lib/gbm/gbm_predicate.rb
117
127
  - lib/random_forest/decision_tree.rb
128
+ - lib/random_forest/features.rb
129
+ - lib/random_forest/gbm.rb
118
130
  - lib/random_forest/node.rb
119
- - lib/random_forest/predicate.rb
120
131
  - lib/random_forest/random_forest.rb
121
132
  - lib/random_forest/simple_predicate.rb
122
133
  - lib/random_forest/simple_set_predicate.rb
@@ -148,6 +159,6 @@ rubyforge_project:
148
159
  rubygems_version: 2.2.2
149
160
  signing_key:
150
161
  specification_version: 4
151
- summary: Creates a random forest object from a pmml file.
162
+ summary: Ruby Scoring API for PMML.
152
163
  test_files: []
153
164
  has_rdoc:
data/lib/gbm/gbm.rb DELETED
@@ -1,23 +0,0 @@
1
- require 'gbm_decision_tree'
2
-
3
- class Gbm
4
- GBM_FOREST_XPATH = '//Segmentation[@multipleModelMethod="sum"]/Segment'
5
- CONST_XPATH = '//Constant[@dataType="double"]'
6
-
7
- def initialize(xml)
8
- @decision_trees = xml.xpath(GBM_FOREST_XPATH).collect{ |xml_tree|
9
- GbmDecisionTree.new(xml_tree)
10
- }
11
- @const = Float(xml.xpath(CONST_XPATH).children[0].content)
12
- end
13
-
14
- def tree_count
15
- @decision_trees.count
16
- end
17
-
18
- def score(features)
19
- x = @decision_trees.map { |dt| dt.decide(features) }.reduce(:+) + @const
20
- Math.exp(x) / (1 + Math.exp(x))
21
- end
22
- end
23
-
@@ -1,37 +0,0 @@
1
- require 'gbm_node'
2
-
3
- class GbmDecisionTree
4
- attr_reader :root
5
-
6
- def initialize(tree_xml)
7
- @id = tree_xml.attribute('id')
8
- @root = GbmNode.new(tree_xml.xpath('TreeModel/Node'))
9
- end
10
-
11
- def decide(features)
12
- curr = @root
13
- while curr.score.nil?
14
- prev = curr
15
- curr = step(curr, features)
16
- return if didnt_step?(curr, prev)
17
- end
18
-
19
- curr.score
20
- end
21
-
22
- private
23
-
24
- def step(curr, features)
25
- curr = curr.left if curr.left && curr.left.true?(features)
26
- curr = curr.right if curr.right && curr.right.true?(features)
27
- curr = curr.missing if curr.missing && curr.missing.true?(features)
28
- curr
29
- end
30
-
31
- def didnt_step?(curr, prev)
32
- return false if (prev.pred != curr.pred)
33
- Scoruby.logger.error "Null tree: #{@id}, bad feature: #{curr.left.pred.field }"
34
- true
35
- end
36
-
37
- end
data/lib/gbm/gbm_node.rb DELETED
@@ -1,23 +0,0 @@
1
- require 'gbm_predicate'
2
-
3
- class GbmNode
4
-
5
- attr_reader :score, :missing, :left, :right, :pred
6
-
7
- def initialize(xml)
8
- children = xml.children
9
- @pred = GbmPredicate.new(children[0])
10
-
11
- @score = xml.attribute('score').to_s.to_f unless xml.attribute('score').to_s.empty?
12
-
13
- return if children.count == 1
14
- @missing = GbmNode.new(children[1]) if children[1]
15
- @left = GbmNode.new(children[2]) if children[2]
16
- @right = GbmNode.new(children[3]) if children[3]
17
- end
18
-
19
- def true?(features)
20
- @pred.nil? || @pred.true?(features)
21
- end
22
-
23
- end
@@ -1,18 +0,0 @@
1
- require 'simple_predicate'
2
- require 'simple_set_predicate'
3
-
4
- class GbmPredicate
5
-
6
- def initialize(pred_xml)
7
- @pred = SimplePredicate.new(pred_xml) if pred_xml.name == 'SimplePredicate'
8
- @pred = SimpleSetPredicate.new(pred_xml) if pred_xml.name == 'SimpleSetPredicate'
9
- end
10
-
11
- def field
12
- @pred.field
13
- end
14
-
15
- def true?(features)
16
- @pred.true?(features)
17
- end
18
- end
@@ -1,32 +0,0 @@
1
- require 'simple_predicate'
2
- require 'simple_set_predicate'
3
-
4
- class Predicate
5
-
6
- def initialize(pred_xml)
7
- @pred = SimplePredicate.new(pred_xml) if pred_xml.name == 'SimplePredicate'
8
- @pred = SimpleSetPredicate.new(pred_xml) if pred_xml.name == 'SimpleSetPredicate'
9
- end
10
-
11
- def field
12
- @pred.field
13
- end
14
-
15
- def true?(features)
16
- return if missing_feature?(features)
17
- return if nil_feature?(features)
18
- @pred.true?(features)
19
- end
20
-
21
- def missing_feature?(features)
22
- return false if features.has_key? field
23
- Scoruby.logger.error "Missing feature #{field}"
24
- true
25
- end
26
-
27
- def nil_feature?(features)
28
- return false unless features[field].nil?
29
- Scoruby.logger.error "Feature #{field} value is nil"
30
- true
31
- end
32
- end