eps 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/LICENSE.txt +1 -1
- data/README.md +183 -243
- data/lib/eps.rb +27 -3
- data/lib/eps/base_estimator.rb +316 -47
- data/lib/eps/data_frame.rb +141 -0
- data/lib/eps/evaluators/lightgbm.rb +116 -0
- data/lib/eps/evaluators/linear_regression.rb +54 -0
- data/lib/eps/evaluators/naive_bayes.rb +95 -0
- data/lib/eps/evaluators/node.rb +26 -0
- data/lib/eps/label_encoder.rb +41 -0
- data/lib/eps/lightgbm.rb +237 -0
- data/lib/eps/linear_regression.rb +132 -386
- data/lib/eps/metrics.rb +46 -0
- data/lib/eps/model.rb +16 -58
- data/lib/eps/naive_bayes.rb +175 -164
- data/lib/eps/pmml_generators/lightgbm.rb +187 -0
- data/lib/eps/statistics.rb +79 -0
- data/lib/eps/text_encoder.rb +81 -0
- data/lib/eps/utils.rb +22 -0
- data/lib/eps/version.rb +1 -1
- metadata +33 -7
@@ -0,0 +1,187 @@
|
|
1
|
+
module Eps
|
2
|
+
module PmmlGenerators
|
3
|
+
module LightGBM
|
4
|
+
private
|
5
|
+
|
6
|
+
# Builds the PMML document for this LightGBM model.
#
# Collects the data dictionary fields (the target with @labels when labels
# are present, and every feature with its category labels, or nil for
# non-categorical features), hands them to build_pmml, and writes a top-level
# MiningModel through the yielded XML builder. The segmentation written
# depends on @objective:
#
#   "regression" - one summed tree segmentation
#   "binary"     - a model chain: summed trees, then a RegressionModel
#   anything else (multiclass) - a model chain: one summed tree model per
#                  label, then a softmax RegressionModel
#
# Returns whatever build_pmml returns.
def generate_pmml
  # TODO remove zero importance features
  data_fields = {}
  data_fields[@target] = @labels if @labels
  @features.each do |k, type|
    data_fields[k] = type == "categorical" ? @label_encoders[k].labels.keys : nil
  end

  build_pmml(data_fields) do |xml|
    function_name = @objective == "regression" ? "regression" : "classification"
    xml.MiningModel(functionName: function_name, algorithmName: "LightGBM") do
      xml.MiningSchema do
        xml.MiningField(name: @target, usageType: "target")
        @features.each_key do |k|
          # TODO add importance (from @feature_importance, skipping features
          # whose importance is 0), but need to handle text features
          xml.MiningField(name: k) # , importance: ..., missingValueTreatment: "asIs"
        end
      end
      pmml_local_transformations(xml)

      case @objective
      when "regression"
        xml_segmentation(xml, @trees)
      when "binary"
        xml.Segmentation(multipleModelMethod: "modelChain") do
          # Segment 1: the summed trees, with 1 / (1 + exp(-1 * lgbmValue))
          # attached to the output field.
          xml.Segment(id: 1) do
            xml.True
            xml.MiningModel(functionName: "regression") do
              xml.MiningSchema do
                @features.each_key do |k|
                  xml.MiningField(name: k)
                end
              end
              xml.Output do
                # NOTE(review): the RegressionModel in segment 2 reads a field
                # named "transformedLgbmValue", but only "lgbmValue" is
                # declared here (and its expression refers to "lgbmValue"
                # itself) - confirm whether a separate transformed output
                # field was intended.
                xml.OutputField(name: "lgbmValue", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false) do
                  xml.Apply(function: "/") do
                    # The literal must be written as a text node: a bare
                    # number as the block's value is discarded by the builder
                    # and leaves the Constant element empty.
                    xml.Constant(dataType: "double") { xml.text "1.0" }
                    xml.Apply(function: "+") do
                      xml.Constant(dataType: "double") { xml.text "1.0" }
                      xml.Apply(function: "exp") do
                        xml.Apply(function: "*") do
                          xml.Constant(dataType: "double") { xml.text "-1.0" }
                          xml.FieldRef(field: "lgbmValue")
                        end
                      end
                    end
                  end
                end
              end
              xml_segmentation(xml, @trees)
            end
          end

          # Segment 2: maps the transformed value onto the two labels.
          xml.Segment(id: 2) do
            xml.True
            xml.RegressionModel(functionName: "classification", normalizationMethod: "none") do
              xml.MiningSchema do
                xml.MiningField(name: @target, usageType: "target")
                xml.MiningField(name: "transformedLgbmValue")
              end
              xml.Output do
                @labels.each do |label|
                  xml.OutputField(name: "probability(#{label})", optype: "continuous", dataType: "double", feature: "probability", value: label)
                end
              end
              xml.RegressionTable(intercept: 0.0, targetCategory: @labels.last) do
                xml.NumericPredictor(name: "transformedLgbmValue", coefficient: "1.0")
              end
              xml.RegressionTable(intercept: 0.0, targetCategory: @labels.first)
            end
          end
        end
      else # multiclass
        xml.Segmentation(multipleModelMethod: "modelChain") do
          # @trees is split into @labels.size consecutive, equally sized
          # groups; group idx feeds the output field for @labels[idx].
          n = @trees.size / @labels.size
          @trees.each_slice(n).each_with_index do |trees, idx|
            xml.Segment(id: idx + 1) do
              xml.True
              xml.MiningModel(functionName: "regression") do
                xml.MiningSchema do
                  @features.each_key do |k|
                    xml.MiningField(name: k)
                  end
                end
                xml.Output do
                  xml.OutputField(name: "lgbmValue(#{@labels[idx]})", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false)
                end
                xml_segmentation(xml, trees)
              end
            end
          end

          # Final segment: softmax over the per-label values.
          xml.Segment(id: @labels.size + 1) do
            xml.True
            xml.RegressionModel(functionName: "classification", normalizationMethod: "softmax") do
              xml.MiningSchema do
                xml.MiningField(name: @target, usageType: "target")
                @labels.each do |label|
                  xml.MiningField(name: "lgbmValue(#{label})")
                end
              end
              xml.Output do
                @labels.each do |label|
                  xml.OutputField(name: "probability(#{label})", optype: "continuous", dataType: "double", feature: "probability", value: label)
                end
              end
              @labels.each do |label|
                xml.RegressionTable(intercept: 0.0, targetCategory: label) do
                  xml.NumericPredictor(name: "lgbmValue(#{label})", coefficient: "1.0")
                end
              end
            end
          end
        end
      end
    end
  end
end
|
135
|
+
|
136
|
+
# Writes a Segmentation (multipleModelMethod "sum") containing one
# always-true Segment per tree, each wrapping a regression TreeModel.
#
# xml   - the XML builder to write to
# trees - the root node of each tree, in order; segment ids are the
#         1-based positions in this list
#
# Each TreeModel's MiningSchema lists the distinct fields reported by
# node_fields for that tree (passed through display_field), followed by the
# node structure written by node_pmml.
def xml_segmentation(xml, trees)
  xml.Segmentation(multipleModelMethod: "sum") do
    trees.each_with_index do |root, index|
      xml.Segment(id: index + 1) do
        xml.True
        xml.TreeModel(
          functionName: "regression",
          missingValueStrategy: "none",
          noTrueChildStrategy: "returnLastPrediction",
          splitCharacteristic: "multiSplit"
        ) do
          xml.MiningSchema do
            node_fields(root).uniq.each do |field|
              xml.MiningField(name: display_field(field))
            end
          end
          node_pmml(root, xml)
        end
      end
    end
  end
end
|
153
|
+
|
154
|
+
# Collects the field of every node in the tree rooted at `node` that has a
# predicate, in pre-order (the node itself first, then each child's subtree
# in order). Duplicates are kept; callers that need distinct names call uniq.
#
# node - responds to predicate, field and children
#
# Returns a new Array of fields.
def node_fields(node)
  own = node.predicate ? [node.field] : []
  own + node.children.flat_map { |child| node_fields(child) }
end
|
162
|
+
|
163
|
+
# Writes `node` and, recursively, all of its children as nested Node
# elements.
#
# node - responds to score, predicate, operator, field, value and children
# xml  - the XML builder to write to
#
# Inside each Node the predicate element comes first:
#   * True when the node has no predicate (predicate is nil),
#   * SimpleSetPredicate (isIn) with a string Array when the operator is
#     "in" - node.value is then a list whose members are escaped with
#     escape_element and joined by single spaces,
#   * SimplePredicate otherwise.
# The child nodes follow in order.
def node_pmml(node, xml)
  xml.Node(score: node.score) do
    if node.predicate.nil?
      xml.True
    elsif node.operator == "in"
      xml.SimpleSetPredicate(field: display_field(node.field), booleanOperator: "isIn") do
        xml.Array(type: "string") do
          xml.text(node.value.map { |v| escape_element(v) }.join(" "))
        end
      end
    else
      xml.SimplePredicate(field: display_field(node.field), operator: node.operator, value: node.value)
    end

    node.children.each { |child| node_pmml(child, xml) }
  end
end
|
181
|
+
|
182
|
+
# Formats one member of a space-separated string Array: the value is wrapped
# in double quotes and every double quote inside it is preceded by a
# backslash.
#
# v - the value to quote; converted with to_s first, so non-String values
#     no longer raise NoMethodError
#
# Returns a new String.
def escape_element(v)
  # Block form keeps the replacement literal (no backslash interpretation).
  escaped = v.to_s.gsub('"') { '\"' }
  "\"#{escaped}\""
end
|
185
|
+
end
|
186
|
+
end
|
187
|
+
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
### Extracted from https://github.com/estebanz01/ruby-statistics
|
2
|
+
### The Ruby author is Esteban Zapata Rojas
|
3
|
+
###
|
4
|
+
### Originally extracted from https://codeplea.com/incomplete-beta-function-c
|
5
|
+
### These functions shared under zlib license and the author is Lewis Van Winkle
|
6
|
+
|
7
|
+
module Eps
  module Statistics
    # Evaluates incomplete_beta_function at
    #   x = (value + sqrt(value^2 + df)) / (2 * sqrt(value^2 + df))
    # with both shape parameters set to df / 2. This is presumably used as the
    # cumulative probability of a t distribution (the name suggests so).
    #
    # value              - the statistic to evaluate
    # degrees_of_freedom - the degrees of freedom
    #
    # Returns whatever incomplete_beta_function returns for those arguments.
    def self.tdist_p(value, degrees_of_freedom)
      root = Math.sqrt(value * value + degrees_of_freedom)
      x = (value + root) / (2.0 * root)
      half_df = degrees_of_freedom / 2.0

      incomplete_beta_function(x, half_df, half_df)
    end

    # Incomplete beta function evaluated with a continued fraction.
    #
    # x   - point of evaluation
    # alp - first shape parameter
    # bet - second shape parameter
    #
    # Returns nil when x < 0.0, 1.0 when x > 1.0, otherwise a Float - or nil
    # when the continued fraction has not converged after 501 iterations.
    def self.incomplete_beta_function(x, alp, bet)
      return if x < 0.0
      return 1.0 if x > 1.0

      tiny = 1.0E-50

      # Past this threshold, evaluate the mirrored problem instead and take
      # the complement.
      if x > ((alp + 1.0) / (alp + bet + 2.0))
        return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
      end

      # To avoid overflow problems, the implementation applies the logarithm
      # properties to calculate in a faster and safer way the values.
      lbet_ab = Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]
      front = Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f

      # This is the non-log version of the left part of the formula (before
      # the continuous fraction)
      # down_left = alp * self.beta_function(alp, bet)
      # upper_left = (x ** alp) * ((1.0 - x) ** bet)
      # front = upper_left/down_left

      f = 1.0
      c = 1.0
      d = 0.0

      # Let's do more iterations than the proposed implementation (200 iters)
      (0..500).each do |number|
        m = number / 2 # integer division is intended

        numerator =
          if number == 0
            1.0
          elsif number.even?
            (m * (bet - m) * x) / ((alp + 2.0 * m - 1.0) * (alp + 2.0 * m))
          else
            top = -((alp + m) * (alp + bet + m) * x)
            down = (alp + 2.0 * m) * (alp + 2.0 * m + 1.0)
            top / down
          end

        # Keep both terms away from zero before dividing by them.
        d = 1.0 + numerator * d
        d = tiny if d.abs < tiny
        d = 1.0 / d

        c = 1.0 + numerator / c
        c = tiny if c.abs < tiny

        cd = c * d
        f *= cd

        # Converged once the latest factor is (numerically) 1.
        return front * (f - 1.0) if (1.0 - cd).abs < 1.0E-10
      end

      # Did not converge.
      nil
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module Eps
  # Turns texts into per-text token counts and learns a vocabulary from them.
  #
  # Options read by this class (all optional):
  #   :vocabulary      - initial vocabulary (defaults to [])
  #   :tokenizer       - pattern handed to String#split
  #   :stop_words      - tokens removed from every text
  #   :case_sensitive  - when falsy, texts are downcased before tokenizing
  #   :min_length      - drop vocabulary tokens shorter than this
  #   :min_occurrences - drop vocabulary tokens seen fewer times than this
  #   :max_occurrences - drop vocabulary tokens seen more times than this
  #   :max_features    - keep only this many of the most frequent tokens
  class TextEncoder
    attr_reader :options, :vocabulary

    def initialize(**options)
      @options = options
      @vocabulary = options[:vocabulary] || []
    end

    # Learns the vocabulary from `arr` and returns the per-text token counts.
    #
    # arr - enumerable of texts (each converted with to_s)
    #
    # The filters are applied to the overall token totals in this order:
    # min_length, min_occurrences, max_occurrences, max_features. Only the
    # vocabulary is filtered; the returned counts are not.
    #
    # Returns an Array with one Hash (token => count) per text.
    def fit(arr)
      counts, token_counts = count_and_fit(arr)

      min_length = options[:min_length]
      counts.select! { |k, _| k.length >= min_length } if min_length

      min_occurrences = options[:min_occurrences]
      counts.select! { |_, v| v >= min_occurrences } if min_occurrences

      max_occurrences = options[:max_occurrences]
      counts.reject! { |_, v| v > max_occurrences } if max_occurrences

      max_features = options[:max_features]
      if max_features
        # most frequent first
        counts = Hash[counts.sort_by { |_, v| -v }[0...max_features]]
      end

      @vocabulary = counts.keys

      token_counts
    end

    # Returns the per-text token counts for `arr` without touching the
    # vocabulary.
    def transform(arr)
      count_and_fit(arr).last
    end

    private

    # Tokenizes every text and counts its tokens.
    #
    # Returns [totals, per_text]: `totals` maps each token to its count over
    # all texts (the empty string is removed), `per_text` holds one
    # token => count Hash per text.
    def count_and_fit(arr)
      tokenizer = options[:tokenizer]
      stop_words = Array(options[:stop_words])
      case_sensitive = options[:case_sensitive]

      per_text =
        arr.map do |xi|
          text = xi.to_s
          text = text.downcase unless case_sensitive

          # tokenize, then remove stop words
          tokens = text.split(tokenizer) - stop_words

          tokens.each_with_object(Hash.new(0)) { |token, xc| xc[token] += 1 }
        end

      totals = Hash.new(0)
      per_text.each do |xc|
        xc.each { |token, count| totals[token] += count }
      end

      # remove empty strings
      totals.delete("")

      [totals, per_text]
    end
  end
end
|
data/lib/eps/utils.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module Eps
  module Utils
    # Classifies a column by the values it contains.
    #
    # c - the column's values (nil/false when the column does not exist)
    # k - the column name, used only in error messages
    #
    # Returns "numeric" when every value is Numeric, "categorical" when every
    # value is a String or every value is a boolean, and nil when every value
    # is nil (which includes an empty column).
    #
    # Raises ArgumentError when the column is missing, when only some values
    # are nil, or when the values are of mixed or unsupported types.
    def self.column_type(c, k)
      raise ArgumentError, "Missing column: #{k}" unless c

      # goes here for empty as well
      return nil if c.all?(&:nil?)

      raise ArgumentError, "Missing values in column #{k}" if c.any?(&:nil?)

      if c.all? { |v| v.is_a?(Numeric) }
        "numeric"
      elsif c.all? { |v| v.is_a?(String) } || c.all? { |v| v == true || v == false }
        # strings and booleans are both treated as categories
        "categorical"
      else
        raise ArgumentError, "Column values must be all numeric, all string, or all boolean: #{k}"
      end
    end
  end
end
|
data/lib/eps/version.rb
CHANGED
metadata
CHANGED
@@ -1,23 +1,37 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: eps
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-05
|
11
|
+
date: 2019-09-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: lightgbm
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.1.5
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.1.5
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
16
30
|
requirements:
|
17
31
|
- - ">="
|
18
32
|
- !ruby/object:Gem::Version
|
19
33
|
version: '0'
|
20
|
-
type: :
|
34
|
+
type: :runtime
|
21
35
|
prerelease: false
|
22
36
|
version_requirements: !ruby/object:Gem::Requirement
|
23
37
|
requirements:
|
@@ -25,7 +39,7 @@ dependencies:
|
|
25
39
|
- !ruby/object:Gem::Version
|
26
40
|
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
42
|
+
name: bundler
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
45
|
- - ">="
|
@@ -39,7 +53,7 @@ dependencies:
|
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
56
|
+
name: daru
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
59
|
- - ">="
|
@@ -53,7 +67,7 @@ dependencies:
|
|
53
67
|
- !ruby/object:Gem::Version
|
54
68
|
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
70
|
+
name: minitest
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
73
|
- - ">="
|
@@ -92,9 +106,21 @@ files:
|
|
92
106
|
- lib/eps.rb
|
93
107
|
- lib/eps/base.rb
|
94
108
|
- lib/eps/base_estimator.rb
|
109
|
+
- lib/eps/data_frame.rb
|
110
|
+
- lib/eps/evaluators/lightgbm.rb
|
111
|
+
- lib/eps/evaluators/linear_regression.rb
|
112
|
+
- lib/eps/evaluators/naive_bayes.rb
|
113
|
+
- lib/eps/evaluators/node.rb
|
114
|
+
- lib/eps/label_encoder.rb
|
115
|
+
- lib/eps/lightgbm.rb
|
95
116
|
- lib/eps/linear_regression.rb
|
117
|
+
- lib/eps/metrics.rb
|
96
118
|
- lib/eps/model.rb
|
97
119
|
- lib/eps/naive_bayes.rb
|
120
|
+
- lib/eps/pmml_generators/lightgbm.rb
|
121
|
+
- lib/eps/statistics.rb
|
122
|
+
- lib/eps/text_encoder.rb
|
123
|
+
- lib/eps/utils.rb
|
98
124
|
- lib/eps/version.rb
|
99
125
|
homepage: https://github.com/ankane/eps
|
100
126
|
licenses:
|