scoruby 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 98eb545b1bb149f22fb5afbaf467d144af1dbb2c
4
- data.tar.gz: dbae0fa59acbf5bb10252bd1971cda46fafb8095
3
+ metadata.gz: f1a4d1fcce322b50113aa532d34ea7397e17dfa7
4
+ data.tar.gz: e39e4e849ac6c98c14625372e1e17219b7ae2999
5
5
  SHA512:
6
- metadata.gz: c572f65fdd04226519f6518aa56ec92d942b8882c700828f2c17447b89cff9286fea243a8f171b2b259b3a15dbce6985ea474ff14a802894fa068546db355ce8
7
- data.tar.gz: 9bd8096adc82636f5a10d6b93d6f0230a7a82b67a755da53153b32dc58c29887a06a31af86235bf2533792bdbccc8b9c77d9677bb3c099c8ab30793aa4696ce4
6
+ metadata.gz: bdd1e14f38a6ab1675c54e7b795bc56f7bab845f6b05f3f8725c1b439737e23471dec7a3f94b60d982f8dbdef349a892d4c045c2f48ff45e0b0c06057599e86e
7
+ data.tar.gz: ec3ae16a4b3fd4b6299f89c925718ad9583fca0d7d27dc45661edc5ea8597a183869a29552aeaa74ddb01af92f421144a89cf1adedf93527e0d0811855c3c0f0
data/README.md CHANGED
@@ -1,10 +1,17 @@
1
1
  <a href="https://codeclimate.com/github/asafschers/scoruby"><img src="https://codeclimate.com/github/asafschers/scoruby/badges/gpa.svg" /></a>
2
- [![Gem Version](https://badge.fury.io/rb/random_forester.svg)](https://badge.fury.io/rb/random_forester)
2
+ [![Coverage Status](https://coveralls.io/repos/github/asafschers/scoruby/badge.svg?branch=master)](https://coveralls.io/github/asafschers/scoruby?branch=master)
3
+ [![Gem Version](https://badge.fury.io/rb/scoruby.svg)](https://badge.fury.io/rb/scoruby)
3
4
  [![Build Status](https://travis-ci.org/asafschers/scoruby.svg?branch=master)](https://travis-ci.org/asafschers/scoruby)
4
5
 
5
6
  # Scoruby
6
7
 
7
- Reads Random Forest PMML files and creates Ruby Random Forest classifier model.
8
+ Ruby scoring API for Predictive Model Markup Language (PMML).
9
+
10
+ Currently supports random forest and gradient boosted models.
11
+
12
+ I will be happy to implement new models on demand, or to assist with any other issue.
13
+
14
+ Contact me here or at aschers@gmail.com.
8
15
 
9
16
  ## Installation
10
17
 
@@ -23,14 +30,115 @@ Or install it yourself as:
23
30
  $ gem install scoruby
24
31
 
25
32
  ## Usage
33
+ ### Random Forest
34
+ #### Generate PMML - R
35
+
36
+ ```R
37
+
38
+ # Install and require randomForest, pmml packages
39
+
40
+ install.packages('randomForest')
41
+ install.packages('pmml')
42
+ library('randomForest')
43
+ library('pmml')
44
+
45
+ # Login to Kaggle and download titanic dataset
46
+ # https://www.kaggle.com/c/titanic/data
47
+ # Load CSV to data frame -
48
+
49
+ titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
50
+ titanic.train$Survived <- as.factor(titanic.train$Survived)
51
+
52
+ # Train RF model
53
+
54
+ titanic.rf <- randomForest(Survived ~ . - Name - Cabin - Ticket,
55
+ data = titanic.train,
56
+ na.action = na.roughfix)
57
+
58
+ # Generate pmml from model
59
+
60
+ pmml <- pmml(titanic.rf)
61
+ saveXML(pmml, 'titanic_rf.pmml')
62
+
63
+ ```
64
+
65
+ #### Classify by PMML - Ruby
26
66
 
27
67
  ```ruby
28
- random_forest = Scourby.get_model 'rf.pmml'
29
- features = {a: 1, b: true, c: "YES"}
68
+
69
+ random_forest = Scoruby.get_model 'titanic_rf.pmml'
70
+ features = {
71
+ Sex: 'male',
72
+ Parch: 0,
73
+ Age: 30,
74
+ Fare: 9.6875,
75
+ Pclass: 2,
76
+ SibSp: 0,
77
+ Embarked: 'Q'
78
+ }
79
+
30
80
  random_forest.predict(features)
81
+
82
+ => "0"
83
+
31
84
  random_forest.decisions_count(features)
85
+
86
+ => {"0"=>441, "1"=>59}
87
+
32
88
  ```
33
89
 
90
+ ### Gradient Boosted model
91
+
92
+ #### Generate PMML - R
93
+
94
+ ```R
95
+
96
+ # Install and require gbm, r2pmml
97
+
98
+ library("devtools")
99
+ install_github(repo = "jpmml/r2pmml")
100
+
101
+ library("r2pmml")
102
+ library("gbm")
103
+
104
+ # Login to Kaggle and download titanic dataset
105
+ # https://www.kaggle.com/c/titanic/data
106
+ # Load CSV to data frame -
107
+
108
+ titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
109
+ titanic.train$Survived <- as.factor(titanic.train$Survived)
110
+
111
+ # Train GBM model
112
+
113
+ titanic.gbm <- gbm(Survived ~ . - PassengerId - Name - Cabin - Ticket, data = titanic.train)
114
+
115
+ # Generate pmml from model
116
+
117
+ pmml <- r2pmml(titanic.gbm, 'titanic_gbm.pmml')
118
+
119
+ ```
120
+
121
+ #### Classify by PMML - Ruby
122
+
123
+ ```ruby
124
+
125
+ gbm = Scoruby.get_model 'titanic_gbm.pmml'
126
+
127
+ features = {
128
+ Sex: 'male',
129
+ Parch: 0,
130
+ Age: 30,
131
+ Fare: 9.6875,
132
+ Pclass: 2,
133
+ SibSp: 0,
134
+ Embarked: 'Q'
135
+ }
136
+
137
+ gbm.score(features)
138
+
139
+ => 0.3652639329522468
140
+
141
+ ```
34
142
 
35
143
  ## Development
36
144
 
@@ -23,14 +23,20 @@ class DecisionTree
23
23
  private
24
24
 
25
25
  def step(curr, features)
26
- curr = curr.left if curr.left && curr.left.true?(features)
27
- curr = curr.right if curr.right && curr.right.true?(features)
26
+ curr = step_on_true(curr, features, 0)
27
+ curr = step_on_true(curr, features, 1)
28
+ curr = step_on_true(curr, features, 2)
29
+ curr
30
+ end
31
+
32
+ def step_on_true(curr, features, num)
33
+ return curr.children[num] if curr.children && curr.children[num] && curr.children[num].true?(features)
28
34
  curr
29
35
  end
30
36
 
31
37
  def didnt_step?(curr, prev)
32
38
  return false if (prev.pred != curr.pred)
33
- Scoruby.logger.error "Null tree: #{@id}, bad feature: #{curr.left.pred.field }"
39
+ Scoruby.logger.error "Null tree: #{@id}, bad feature: #{curr.children[0].pred.field }"
34
40
  true
35
41
  end
36
42
  end
@@ -0,0 +1,16 @@
1
+ class Features
2
+
3
+ attr_reader :formatted
4
+
5
+ def initialize(features)
6
+ @formatted = format_booleans(features)
7
+ end
8
+
9
+ def format_booleans(features)
10
+ features.map { |k, v|
11
+ features[k] = 'f' if v == false
12
+ features[k] = 't' if v == true
13
+ }
14
+ features
15
+ end
16
+ end
@@ -0,0 +1,29 @@
1
+ require 'decision_tree'
2
+ require 'features'
3
+
4
+ class Gbm
5
+ GBM_FOREST_XPATH = '//Segmentation[@multipleModelMethod="sum"]/Segment'
6
+ CONST_XPATH = '//Target/@rescaleConstant'
7
+
8
+ def initialize(xml)
9
+ @decision_trees = xml.xpath(GBM_FOREST_XPATH).collect{ |xml_tree|
10
+ DecisionTree.new(xml_tree)
11
+ }
12
+ @const = Float(xml.xpath(CONST_XPATH).to_s)
13
+ end
14
+
15
+ def tree_count
16
+ @decision_trees.count
17
+ end
18
+
19
+ def score(features)
20
+ formatted_features = Features.new(features).formatted
21
+ x = @decision_trees.map { |dt|
22
+ score = dt.decide(formatted_features)
23
+ score.to_s.to_f
24
+ }.reduce(:+) + @const
25
+ Math.exp(x) / (1 + Math.exp(x))
26
+ end
27
+
28
+ end
29
+
@@ -1,18 +1,23 @@
1
- require 'predicate'
1
+ require 'simple_predicate'
2
+ require 'simple_set_predicate'
2
3
 
3
4
  class Node
4
5
 
5
- attr_reader :decision, :left, :right, :pred
6
+ attr_reader :decision, :pred, :children
6
7
 
7
8
  def initialize(xml)
8
9
  children = xml.children
9
- @pred = Predicate.new(children[0])
10
-
10
+ pred_xml = children[0]
11
+ @pred = SimplePredicate.new(pred_xml) if pred_xml.name == 'SimplePredicate'
12
+ @pred = SimpleSetPredicate.new(pred_xml) if pred_xml.name == 'SimpleSetPredicate'
13
+ @children = []
11
14
  @decision = xml.attribute('score').to_s
12
15
 
13
16
  return if children.count == 1
14
- @left = Node.new(children[1]) if children[1]
15
- @right = Node.new(children[2]) if children[2]
17
+
18
+ @children << Node.new(children[1]) if children[1]
19
+ @children << Node.new(children[2]) if children[2]
20
+ @children << Node.new(children[3]) if children[3]
16
21
  end
17
22
 
18
23
  def true?(features)
@@ -11,8 +11,9 @@ class RandomForest
11
11
  end
12
12
 
13
13
  def decisions_count(features)
14
+ formatted_features = Features.new(features).formatted
14
15
  decisions = @decision_trees.collect { |decision_tree|
15
- decision_tree.decide(features)
16
+ decision_tree.decide(formatted_features)
16
17
  }
17
18
  decisions.inject(Hash.new(0)) { |h, e| h[e] += 1 ; h }
18
19
  end
@@ -20,9 +20,7 @@ class SimplePredicate
20
20
  end
21
21
 
22
22
  def true?(features)
23
- format_boolean(features)
24
23
  return num_true?(features) if MATH_OPS.include?(@operator)
25
-
26
24
  return features[@field] == @value if @operator == EQUAL
27
25
  features[field].nil? || !features.has_key?(field) if @operator == IS_MISSING
28
26
  end
@@ -36,9 +34,4 @@ class SimplePredicate
36
34
  return curr_value <= value if @operator == LESS_OR_EQUAL
37
35
  curr_value >= value if @operator == GREATER_OR_EQUAL
38
36
  end
39
-
40
- def format_boolean(features)
41
- features[@field] = 'f' if features[@field] == false
42
- features[@field] = 't' if features[@field] == true
43
- end
44
37
  end
@@ -12,7 +12,6 @@ class SimpleSetPredicate
12
12
  end
13
13
 
14
14
  def true?(features)
15
- format_boolean(features)
16
15
  @array.include? features[@field] if @operator == IS_IN
17
16
  end
18
17
 
@@ -23,9 +22,4 @@ class SimpleSetPredicate
23
22
  reject(&:empty?).
24
23
  map { |w| w.tr('"','')}
25
24
  end
26
-
27
- def format_boolean(features)
28
- features[@field] = 'f' if features[@field] == false
29
- features[@field] = 't' if features[@field] == true
30
- end
31
25
  end
@@ -1,3 +1,3 @@
1
1
  module Scoruby
2
- VERSION = '0.1.3'
2
+ VERSION = '0.2.0'
3
3
  end
data/scoruby.gemspec CHANGED
@@ -9,7 +9,7 @@ Gem::Specification.new do |spec|
9
9
  spec.authors = ["Asaf Schers"]
10
10
  spec.email = ["schers@riskified.com"]
11
11
 
12
- spec.summary = %q{Creates a random forest object from a pmml file.}
12
+ spec.summary = %q{Ruby Scoring API for PMML.}
13
13
  spec.homepage = 'https://github.com/asafschers/scoruby'
14
14
  spec.license = "MIT"
15
15
 
@@ -22,6 +22,7 @@ Gem::Specification.new do |spec|
22
22
  spec.add_development_dependency "rake", "~> 12.0"
23
23
  spec.add_development_dependency "rspec", "~> 3.5"
24
24
  spec.add_development_dependency "pry", "~> 0.10"
25
+ spec.add_development_dependency "coveralls"
25
26
  spec.add_development_dependency "ruby-prof"
26
27
  spec.add_dependency "nokogiri", "~> 1.7"
27
28
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scoruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Schers
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-04-21 00:00:00.000000000 Z
11
+ date: 2017-07-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.10'
69
+ - !ruby/object:Gem::Dependency
70
+ name: coveralls
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: ruby-prof
71
85
  requirement: !ruby/object:Gem::Requirement
@@ -110,13 +124,10 @@ files:
110
124
  - Rakefile
111
125
  - bin/console
112
126
  - bin/setup
113
- - lib/gbm/gbm.rb
114
- - lib/gbm/gbm_decision_tree.rb
115
- - lib/gbm/gbm_node.rb
116
- - lib/gbm/gbm_predicate.rb
117
127
  - lib/random_forest/decision_tree.rb
128
+ - lib/random_forest/features.rb
129
+ - lib/random_forest/gbm.rb
118
130
  - lib/random_forest/node.rb
119
- - lib/random_forest/predicate.rb
120
131
  - lib/random_forest/random_forest.rb
121
132
  - lib/random_forest/simple_predicate.rb
122
133
  - lib/random_forest/simple_set_predicate.rb
@@ -148,6 +159,6 @@ rubyforge_project:
148
159
  rubygems_version: 2.2.2
149
160
  signing_key:
150
161
  specification_version: 4
151
- summary: Creates a random forest object from a pmml file.
162
+ summary: Ruby Scoring API for PMML.
152
163
  test_files: []
153
164
  has_rdoc:
data/lib/gbm/gbm.rb DELETED
@@ -1,23 +0,0 @@
1
- require 'gbm_decision_tree'
2
-
3
- class Gbm
4
- GBM_FOREST_XPATH = '//Segmentation[@multipleModelMethod="sum"]/Segment'
5
- CONST_XPATH = '//Constant[@dataType="double"]'
6
-
7
- def initialize(xml)
8
- @decision_trees = xml.xpath(GBM_FOREST_XPATH).collect{ |xml_tree|
9
- GbmDecisionTree.new(xml_tree)
10
- }
11
- @const = Float(xml.xpath(CONST_XPATH).children[0].content)
12
- end
13
-
14
- def tree_count
15
- @decision_trees.count
16
- end
17
-
18
- def score(features)
19
- x = @decision_trees.map { |dt| dt.decide(features) }.reduce(:+) + @const
20
- Math.exp(x) / (1 + Math.exp(x))
21
- end
22
- end
23
-
@@ -1,37 +0,0 @@
1
- require 'gbm_node'
2
-
3
- class GbmDecisionTree
4
- attr_reader :root
5
-
6
- def initialize(tree_xml)
7
- @id = tree_xml.attribute('id')
8
- @root = GbmNode.new(tree_xml.xpath('TreeModel/Node'))
9
- end
10
-
11
- def decide(features)
12
- curr = @root
13
- while curr.score.nil?
14
- prev = curr
15
- curr = step(curr, features)
16
- return if didnt_step?(curr, prev)
17
- end
18
-
19
- curr.score
20
- end
21
-
22
- private
23
-
24
- def step(curr, features)
25
- curr = curr.left if curr.left && curr.left.true?(features)
26
- curr = curr.right if curr.right && curr.right.true?(features)
27
- curr = curr.missing if curr.missing && curr.missing.true?(features)
28
- curr
29
- end
30
-
31
- def didnt_step?(curr, prev)
32
- return false if (prev.pred != curr.pred)
33
- Scoruby.logger.error "Null tree: #{@id}, bad feature: #{curr.left.pred.field }"
34
- true
35
- end
36
-
37
- end
data/lib/gbm/gbm_node.rb DELETED
@@ -1,23 +0,0 @@
1
- require 'gbm_predicate'
2
-
3
- class GbmNode
4
-
5
- attr_reader :score, :missing, :left, :right, :pred
6
-
7
- def initialize(xml)
8
- children = xml.children
9
- @pred = GbmPredicate.new(children[0])
10
-
11
- @score = xml.attribute('score').to_s.to_f unless xml.attribute('score').to_s.empty?
12
-
13
- return if children.count == 1
14
- @missing = GbmNode.new(children[1]) if children[1]
15
- @left = GbmNode.new(children[2]) if children[2]
16
- @right = GbmNode.new(children[3]) if children[3]
17
- end
18
-
19
- def true?(features)
20
- @pred.nil? || @pred.true?(features)
21
- end
22
-
23
- end
@@ -1,18 +0,0 @@
1
- require 'simple_predicate'
2
- require 'simple_set_predicate'
3
-
4
- class GbmPredicate
5
-
6
- def initialize(pred_xml)
7
- @pred = SimplePredicate.new(pred_xml) if pred_xml.name == 'SimplePredicate'
8
- @pred = SimpleSetPredicate.new(pred_xml) if pred_xml.name == 'SimpleSetPredicate'
9
- end
10
-
11
- def field
12
- @pred.field
13
- end
14
-
15
- def true?(features)
16
- @pred.true?(features)
17
- end
18
- end
@@ -1,32 +0,0 @@
1
- require 'simple_predicate'
2
- require 'simple_set_predicate'
3
-
4
- class Predicate
5
-
6
- def initialize(pred_xml)
7
- @pred = SimplePredicate.new(pred_xml) if pred_xml.name == 'SimplePredicate'
8
- @pred = SimpleSetPredicate.new(pred_xml) if pred_xml.name == 'SimpleSetPredicate'
9
- end
10
-
11
- def field
12
- @pred.field
13
- end
14
-
15
- def true?(features)
16
- return if missing_feature?(features)
17
- return if nil_feature?(features)
18
- @pred.true?(features)
19
- end
20
-
21
- def missing_feature?(features)
22
- return false if features.has_key? field
23
- Scoruby.logger.error "Missing feature #{field}"
24
- true
25
- end
26
-
27
- def nil_feature?(features)
28
- return false unless features[field].nil?
29
- Scoruby.logger.error "Feature #{field} value is nil"
30
- true
31
- end
32
- end