scoruby 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5ab6779638032408bc0a38812ded29891831aebd
4
- data.tar.gz: d26d4338fdbb2f00a7d6fed974a6e2d577c1d847
3
+ metadata.gz: 6d01f41ef6e3e3485acb65587892402866004bd2
4
+ data.tar.gz: 7d898f2a4c86915e19952a3641286dd252d7c3fd
5
5
  SHA512:
6
- metadata.gz: 080081fe11bdba935bdd99b178ce74a04d1d05dadc080283f9e474a4d84e8415d2a39cf6a9916c4ee46c78fb46965aa934a0850491de04f6fd1d5806aaa32463
7
- data.tar.gz: 38a7ac8547797f2367fcd66036ea87971c531106c0dd0b4c56abf684f48a72db14a9139fa5ff29b0e809aadb5ffb0908a265a603aa6d3ddf5725dc4858cdab0a
6
+ metadata.gz: de6e7acbbcf5acd97f2b980253be94c7eb2fb7c31fb76c9c9d1fe6ca09447751974734370826d1477db639685a6ab37108ced5d85e4fc9b397285e1b61468a4e
7
+ data.tar.gz: 92e5728b151d7c3daa24e5fccebe6b7e8567c92af4fb03cdbc5b5ef20f5a75b53c88b5ef79e6d4db20df157995f45b6222677a0e1634702e3769240b61f025d4
data/.gitignore CHANGED
@@ -3,12 +3,12 @@ coverage
3
3
 
4
4
  *.log
5
5
 
6
+ sample.pmml
7
+
6
8
  *.gem
7
9
 
8
10
  spec/fixtures/decision_tree_v2.pmml
9
11
 
10
- sample.pmml
11
-
12
12
  test_gbm.pmml
13
13
 
14
14
  test_gbm.rb
data/.travis.yml CHANGED
@@ -1,4 +1,4 @@
1
1
  language: ruby
2
2
  rvm:
3
- - 2.1.2
3
+ - 2.4.1
4
4
  before_install: gem install bundler -v 1.10.5
data/README.md CHANGED
@@ -7,7 +7,7 @@
7
7
 
8
8
  Ruby scoring API for Predictive Model Markup Language (PMML).
9
9
 
10
- Currently supports random forest and gradient boosted models.
10
+ Currently supports Decision Tree, Random Forest and Gradient Boosted Models.
11
11
 
12
12
  Will be happy to implement new models by demand, or assist with any other issue.
13
13
 
@@ -31,37 +31,8 @@ Or install it yourself as:
31
31
 
32
32
  ## Usage
33
33
  ### Random Forest
34
- #### Generate PMML - R
35
-
36
- ```R
37
-
38
- # Install and require randomForest, pmml packages
39
-
40
- install.packages('randomForest')
41
- install.packages('pmml')
42
- library('randomForest')
43
- library('pmml')
44
-
45
- # Login to Kaggle and download titanic dataset
46
- # https://www.kaggle.com/c/titanic/data
47
- # Load CSV to data frame -
48
-
49
- titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
50
- titanic.train$Survived <- as.factor(titanic.train$Survived)
51
-
52
- # Train RF model
53
-
54
- titanic.rf <- randomForest(Survived ~ . - Name - Cabin - Ticket,
55
- data = titanic.train,
56
- na.action = na.roughfix)
57
-
58
- # Generate pmml from model
59
-
60
- pmml <- pmml(titanic.rf)
61
- saveXML(pmml, 'titanic_rf.pmml')
62
-
63
- ```
64
34
 
35
+ [Generate PMML - R](https://github.com/asafschers/scoruby/wiki/Random-Forest)
65
36
  #### Classify by PMML - Ruby
66
37
 
67
38
  ```ruby
@@ -89,34 +60,7 @@ random_forest.decisions_count(features)
89
60
 
90
61
  ### Gradient Boosted model
91
62
 
92
- #### Generate PMML - R
93
-
94
- ```R
95
-
96
- # Install and require gbm, r2pmml
97
-
98
- library("devtools")
99
- install_github(repo = "jpmml/r2pmml")
100
-
101
- library("r2pmml")
102
- library("gbm")
103
-
104
- # Login to Kaggle and download titanic dataset
105
- # https://www.kaggle.com/c/titanic/data
106
- # Load CSV to data frame -
107
-
108
- titanic.train <- read.table("titanic_train.csv", header = TRUE, sep = ",")
109
- titanic.train$Survived <- as.factor(titanic.train$Survived)
110
-
111
- # Train GBM model
112
-
113
- titanic.gbm <- gbm(Survived ~ . - PassengerId - Name - Cabin - Ticket, data = titanic.train)
114
-
115
- # Generate pmml from model
116
-
117
- pmml <- r2pmml(titanic.gbm, 'titanic_gbm.pmml')
118
-
119
- ```
63
+ [Generate PMML - R](https://github.com/asafschers/scoruby/wiki/Gradient-Boosted-Model)
120
64
 
121
65
  #### Classify by PMML - Ruby
122
66
 
@@ -142,8 +86,6 @@ gbm.score(features)
142
86
 
143
87
  ### Decision Tree
144
88
 
145
- #### Classify by PMML - Ruby
146
-
147
89
  ```ruby
148
90
  decision_tree = Scoruby.get_model 'decision_tree.pmml'
149
91
 
@@ -0,0 +1,92 @@
1
module Scoruby
  module Models
    # Naive Bayes classifier built from a PMML +NaiveBayesModel+ document.
    #
    # Parsed state:
    # * +@threshold+ - replacement for factors that round to zero.
    # * +@data+      - per-feature statistics keyed by the +fieldName+ symbol:
    #   * categorical: <tt>{ category => { label => count } }</tt>
    #   * numerical:   <tt>{ label => { mean:, variance: } }</tt>
    # * +@labels+    - <tt>{ label => { count: Float } }</tt> (label priors).
    class NaiveBayes
      attr_reader :data

      # @param xml an XML document responding to +xpath+ whose nodes respond
      #   to +attr+, +child+, +children+ and +name+
      #   (presumably a Nokogiri document - TODO confirm against the caller).
      def initialize(xml)
        @threshold = xml.xpath('//NaiveBayesModel').attr('threshold').value.to_f

        @data = {}
        xml.xpath('//BayesInput').each do |feature|
          @data[feature.attr('fieldName').to_sym] = fetch_feature(feature)
        end

        @labels = {}
        xml.xpath('//BayesOutput//TargetValueCount').each do |target|
          @labels[target.attr('value')] = { count: target.attr('count').to_f }
        end
      end

      # Computes the unnormalised likelihood of every label for +features+.
      #
      # Each label's value is the product of its prior count and the
      # likelihood of every supplied feature the model can evaluate. Factors
      # that round to zero (5 decimals) are replaced by the model threshold.
      # Features the model does not know, and categorical values it has never
      # seen, contribute no factor.
      #
      # The computation works on per-call data only: model state is never
      # mutated, so results do not depend on earlier calls.
      #
      # @param features [Hash{Symbol => Object}] feature name => value
      # @return [Hash{String => Float}] label => unnormalised likelihood
      def lvalues(features)
        @labels.each_with_object({}) do |(label, label_data), result|
          # Prior first, then one factor per evaluable feature.
          factors = [label_data[:count]]
          features.each do |feature_name, feature_value|
            likelihood = likelihood_for(feature_name, feature_value, label)
            factors << likelihood unless likelihood.nil?
          end
          result[label] = factors.map { |factor| apply_threshold(factor) }.reduce(:*)
        end
      end

      # Normalised probability of +label+ given +features+.
      #
      # @param features [Hash{Symbol => Object}] feature name => value
      # @param label [String] one of the model's target values
      # @return [Float] likelihood of +label+ divided by the sum over all labels
      # @raise [ArgumentError] if +label+ is not a target value of the model
      def score(features, label)
        likelihoods = lvalues(features)
        raise ArgumentError, "unknown label: #{label.inspect}" unless likelihoods.key?(label)

        likelihoods[label] / likelihoods.values.reduce(:+)
      end

      private

      # Likelihood of a single feature value under +label+, or nil when the
      # model cannot evaluate it (unknown feature, unseen category, or no
      # statistics for the label).
      def likelihood_for(feature_name, feature_value, label)
        feature_data = @data[feature_name]
        return if feature_data.nil?

        if numerical_feature?(feature_data)
          stats = feature_data[label]
          stats && calc_numerical(stats, feature_value)
        elsif feature_data[feature_value]
          value_count = feature_data[feature_value][label].to_f
          overall_count = feature_data.sum { |_, counts| counts[label].to_f }
          value_count / overall_count
        end
      end

      # Numerical features store { mean:, variance: } per label; categorical
      # features store label => count hashes, which never have a :mean key.
      def numerical_feature?(feature_data)
        feature_data.each_value.any? { |entry| entry.key?(:mean) }
      end

      # Replaces factors that round to zero with the model threshold so a
      # single zero cannot wipe out the whole product.
      def apply_threshold(value)
        value.round(5).zero? ? @threshold : value
      end

      # Gaussian probability density of +feature_value+ for the given
      # mean/variance statistics.
      def calc_numerical(label_data, feature_value)
        variance = label_data[:variance].to_f
        mean = label_data[:mean].to_f
        feature_value = feature_value.to_f

        exponent = -((feature_value - mean)**2) / (2 * variance)
        Math.exp(exponent) / Math.sqrt(2 * Math::PI * variance)
      end

      # Dispatches on the first child element of a BayesInput node.
      def fetch_feature(feature)
        return fetch_numerical_feature(feature) if feature.child.name == 'TargetValueStats'

        fetch_category_feature(feature)
      end

      # Builds { label => { mean:, variance: } } from a TargetValueStats node.
      # Label values are stripped of surrounding whitespace.
      def fetch_numerical_feature(feature)
        feature.child.children.each_with_object({}) do |child, features_data|
          features_data[child.attr('value').strip] = {
            mean: child.child.attr('mean'),
            variance: child.child.attr('variance')
          }
        end
      end

      # Builds { category => { label => count } } from the children of a
      # categorical BayesInput node.
      def fetch_category_feature(feature)
        feature.children.each_with_object({}) do |category, feature_data|
          feature_data[category.attr('value')] = fetch_category(category)
        end
      end

      # Builds { label => count } for one category node.
      def fetch_category(category)
        category.child.children.each_with_object({}) do |label, category_data|
          category_data[label.attr('value')] = label.attr('count')
        end
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Scoruby
2
- VERSION = '0.2.4'
2
+ VERSION = '0.2.5'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scoruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Asaf Schers
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-07-22 00:00:00.000000000 Z
11
+ date: 2017-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -131,6 +131,7 @@ files:
131
131
  - lib/scoruby/features.rb
132
132
  - lib/scoruby/models/decision_tree.rb
133
133
  - lib/scoruby/models/gbm.rb
134
+ - lib/scoruby/models/naive_bayes.rb
134
135
  - lib/scoruby/models/random_forest.rb
135
136
  - lib/scoruby/models_factory.rb
136
137
  - lib/scoruby/node.rb