eps 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,187 @@
1
module Eps
  module PmmlGenerators
    # Mixin that serializes a LightGBM model to PMML.
    #
    # Reads @target, @labels, @features, @label_encoders, @objective and
    # @trees from the including object, and calls build_pmml,
    # pmml_local_transformations and display_field, none of which are
    # defined in this module (the including object must provide them).
    #
    # NOTE(review): `xml` is presumably a Nokogiri::XML::Builder (the code
    # uses `xml.text` and element-named calls) - confirm against build_pmml.
    module LightGBM
      private

      # Builds the PMML document for the model.
      #
      # Collects the data fields (target labels, category lists for
      # categorical features, nil for everything else), then emits a
      # MiningModel whose body depends on @objective: "regression", "binary",
      # or anything else (treated as multiclass).
      #
      # Returns whatever build_pmml returns.
      def generate_pmml
        data_fields = {}
        data_fields[@target] = @labels if @labels
        @features.each do |k, type|
          # TODO remove zero importance features
          data_fields[k] = type == "categorical" ? @label_encoders[k].labels.keys : nil
        end

        build_pmml(data_fields) do |xml|
          function_name = @objective == "regression" ? "regression" : "classification"
          xml.MiningModel(functionName: function_name, algorithmName: "LightGBM") do
            xml.MiningSchema do
              xml.MiningField(name: @target, usageType: "target")
              @features.keys.each do |k|
                # TODO add importance, but need to handle text features
                xml.MiningField(name: k)
              end
            end
            pmml_local_transformations(xml)

            case @objective
            when "regression"
              xml_segmentation(xml, @trees)
            when "binary"
              xml_binary_segmentation(xml)
            else # multiclass
              xml_multiclass_segmentation(xml)
            end
          end
        end
      end

      # Emits the two-segment model chain for the "binary" objective:
      # segment 1 sums the trees and declares the logistic expression
      # 1 / (1 + exp(-1 * lgbmValue)); segment 2 is a RegressionModel that
      # exposes one probability output per label.
      def xml_binary_segmentation(xml)
        xml.Segmentation(multipleModelMethod: "modelChain") do
          xml.Segment(id: 1) do
            xml.True
            xml.MiningModel(functionName: "regression") do
              xml_feature_schema(xml)
              xml.Output do
                xml.OutputField(name: "lgbmValue", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false) do
                  xml.Apply(function: "/") do
                    # The constant is passed as element content. Returning it
                    # from a block (as before) only evaluates the number and
                    # never hands it to the builder.
                    xml.Constant("1.0", dataType: "double")
                    xml.Apply(function: "+") do
                      xml.Constant("1.0", dataType: "double")
                      xml.Apply(function: "exp") do
                        xml.Apply(function: "*") do
                          xml.Constant("-1.0", dataType: "double")
                          xml.FieldRef(field: "lgbmValue")
                        end
                      end
                    end
                  end
                end
              end
              xml_segmentation(xml, @trees)
            end
          end
          xml.Segment(id: 2) do
            xml.True
            xml.RegressionModel(functionName: "classification", normalizationMethod: "none") do
              xml.MiningSchema do
                xml.MiningField(name: @target, usageType: "target")
                # NOTE(review): no OutputField named "transformedLgbmValue" is
                # emitted in segment 1 (the only one there is "lgbmValue") -
                # confirm whether a second output field is missing.
                xml.MiningField(name: "transformedLgbmValue")
              end
              xml_probability_output(xml)
              xml.RegressionTable(intercept: 0.0, targetCategory: @labels.last) do
                xml.NumericPredictor(name: "transformedLgbmValue", coefficient: "1.0")
              end
              xml.RegressionTable(intercept: 0.0, targetCategory: @labels.first)
            end
          end
        end
      end

      # Emits the model chain for the multiclass case: one summed-tree
      # segment per slice of @trees, followed by a softmax RegressionModel
      # that combines the per-label "lgbmValue(label)" outputs.
      def xml_multiclass_segmentation(xml)
        xml.Segmentation(multipleModelMethod: "modelChain") do
          # Slices are paired with labels by position.
          # NOTE(review): presumably @trees holds each label's trees
          # contiguously and in @labels order - verify against the trainer.
          trees_per_label = @trees.size / @labels.size
          @trees.each_slice(trees_per_label).each_with_index do |trees, idx|
            xml.Segment(id: idx + 1) do
              xml.True
              xml.MiningModel(functionName: "regression") do
                xml_feature_schema(xml)
                xml.Output do
                  xml.OutputField(name: "lgbmValue(#{@labels[idx]})", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false)
                end
                xml_segmentation(xml, trees)
              end
            end
          end
          xml.Segment(id: @labels.size + 1) do
            xml.True
            xml.RegressionModel(functionName: "classification", normalizationMethod: "softmax") do
              xml.MiningSchema do
                xml.MiningField(name: @target, usageType: "target")
                @labels.each do |label|
                  xml.MiningField(name: "lgbmValue(#{label})")
                end
              end
              xml_probability_output(xml)
              @labels.each do |label|
                xml.RegressionTable(intercept: 0.0, targetCategory: label) do
                  xml.NumericPredictor(name: "lgbmValue(#{label})", coefficient: "1.0")
                end
              end
            end
          end
        end
      end

      # Emits a MiningSchema listing every key of @features.
      def xml_feature_schema(xml)
        xml.MiningSchema do
          @features.each do |k, _|
            xml.MiningField(name: k)
          end
        end
      end

      # Emits an Output element with one "probability(label)" field per label.
      def xml_probability_output(xml)
        xml.Output do
          @labels.each do |label|
            xml.OutputField(name: "probability(#{label})", optype: "continuous", dataType: "double", feature: "probability", value: label)
          end
        end
      end

      # Emits a "sum" Segmentation containing one TreeModel per entry of
      # +trees+; segment ids are 1-based positions in the list.
      def xml_segmentation(xml, trees)
        xml.Segmentation(multipleModelMethod: "sum") do
          trees.each_with_index do |node, i|
            xml.Segment(id: i + 1) do
              xml.True
              xml.TreeModel(functionName: "regression", missingValueStrategy: "none", noTrueChildStrategy: "returnLastPrediction", splitCharacteristic: "multiSplit") do
                xml.MiningSchema do
                  node_fields(node).uniq.each do |k|
                    xml.MiningField(name: display_field(k))
                  end
                end
                node_pmml(node, xml)
              end
            end
          end
        end
      end

      # Returns the fields referenced by +node+ and all of its descendants,
      # in depth-first order. Duplicates are kept; nodes without a predicate
      # contribute nothing themselves.
      def node_fields(node)
        fields = node.predicate ? [node.field] : []
        node.children.each do |child|
          fields.concat(node_fields(child))
        end
        fields
      end

      # Emits a PMML Node for +node+ and, recursively, for its children.
      # The predicate is True when the node has none, a SimpleSetPredicate
      # for the "in" operator, and a SimplePredicate otherwise.
      def node_pmml(node, xml)
        xml.Node(score: node.score) do
          if node.predicate.nil?
            xml.True
          elsif node.operator == "in"
            xml.SimpleSetPredicate(field: display_field(node.field), booleanOperator: "isIn") do
              xml.Array(type: "string") do
                xml.text node.value.map { |v| escape_element(v) }.join(" ")
              end
            end
          else
            xml.SimplePredicate(field: display_field(node.field), operator: node.operator, value: node.value)
          end
          node.children.each do |child|
            node_pmml(child, xml)
          end
        end
      end

      # Wraps +v+ in double quotes, prefixing each embedded double quote with
      # a backslash. Backslashes already present in +v+ are left untouched.
      def escape_element(v)
        '"' + v.gsub('"', '\"') + '"'
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ ### Extracted from https://github.com/estebanz01/ruby-statistics
2
+ ### The Ruby author is Esteban Zapata Rojas
3
+ ###
4
+ ### Originally extracted from https://codeplea.com/incomplete-beta-function-c
5
+ ### These functions shared under zlib license and the author is Lewis Van Winkle
6
+
7
module Eps
  # Numerical helpers for Student's t probabilities.
  module Statistics
    # Cumulative probability of Student's t at +value+ for the given degrees
    # of freedom, computed as the regularized incomplete beta function at
    # (t + sqrt(t^2 + df)) / (2 * sqrt(t^2 + df)) with both shape parameters
    # equal to df / 2.
    def self.tdist_p(value, degrees_of_freedom)
      root = Math.sqrt(value * value + degrees_of_freedom)
      x = (value + root) / (2.0 * root)
      half_df = degrees_of_freedom / 2.0

      incomplete_beta_function(x, half_df, half_df)
    end

    # Regularized incomplete beta function evaluated with a continued
    # fraction (modified Lentz iteration).
    #
    # Returns nil when x < 0 or when the fraction has not converged after the
    # iteration budget, and 1.0 when x > 1.
    def self.incomplete_beta_function(x, alp, bet)
      return if x < 0.0
      return 1.0 if x > 1.0

      # The fraction converges faster below this threshold, so larger x is
      # handled through the symmetry I_x(a, b) = 1 - I_(1-x)(b, a).
      if x > ((alp + 1.0) / (alp + bet + 2.0))
        return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
      end

      tiny = 1.0E-50

      # Work in log space to avoid overflow in the gamma terms.
      log_beta = Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]
      front = Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - log_beta) / alp.to_f

      f = 1.0
      c = 1.0
      d = 0.0

      # More iterations than the reference implementation (200).
      0.upto(500) do |step|
        m = step / 2

        numerator =
          if step.zero?
            1.0
          elsif step.even?
            (m * (bet - m) * x) / ((alp + 2.0 * m - 1.0) * (alp + 2.0 * m))
          else
            -((alp + m) * (alp + bet + m) * x) / ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0))
          end

        d = 1.0 + numerator * d
        d = tiny if d.abs < tiny
        d = 1.0 / d

        c = 1.0 + numerator / c
        c = tiny if c.abs < tiny

        cd = c * d
        f *= cd

        # Converged once the latest factor is indistinguishable from 1.
        return front * (f - 1.0) if (1.0 - cd).abs < 1.0E-10
      end

      nil
    end
  end
end
@@ -0,0 +1,81 @@
1
module Eps
  # Bag-of-words encoder: turns each input value into a token => count hash
  # and learns a vocabulary from the totals.
  #
  # Options read by this class: :vocabulary, :tokenizer, :stop_words,
  # :case_sensitive, :min_length, :min_occurrences, :max_occurrences and
  # :max_features.
  class TextEncoder
    attr_reader :options, :vocabulary

    def initialize(**options)
      @options = options
      @vocabulary = options[:vocabulary] || []
    end

    # Counts tokens in +arr+, applies the configured filters to the overall
    # totals, stores the surviving tokens as the vocabulary, and returns the
    # per-element count hashes (which are not filtered).
    def fit(arr)
      totals, per_document = count_and_fit(arr)

      if (min_length = options[:min_length])
        totals.select! { |token, _| token.length >= min_length }
      end

      if (min_occurrences = options[:min_occurrences])
        totals.select! { |_, total| total >= min_occurrences }
      end

      if (max_occurrences = options[:max_occurrences])
        totals.reject! { |_, total| total > max_occurrences }
      end

      if (max_features = options[:max_features])
        # Keep only the most frequent tokens.
        totals = Hash[totals.sort_by { |_, total| -total }[0...max_features]]
      end

      @vocabulary = totals.keys

      per_document
    end

    # Returns the per-element count hashes for +arr+ without touching the
    # vocabulary.
    def transform(arr)
      count_and_fit(arr).last
    end

    private

    # Tokenizes every element of +arr+ and returns a pair:
    # [total count per token across all elements, array of per-element counts].
    def count_and_fit(arr)
      tokenizer = options[:tokenizer]
      stop_words = Array(options[:stop_words])
      case_sensitive = options[:case_sensitive]

      per_document = arr.map do |xi|
        text = xi.to_s
        text = text.downcase unless case_sensitive

        # tokenize, then drop stop words
        tokens = text.split(tokenizer) - stop_words

        tokens.each_with_object(Hash.new(0)) do |token, tally|
          tally[token] += 1
        end
      end

      totals = Hash.new(0)
      per_document.each do |tally|
        tally.each do |token, occurrences|
          totals[token] += occurrences
        end
      end

      # empty strings never become vocabulary entries
      totals.delete("")

      [totals, per_document]
    end
  end
end
data/lib/eps/utils.rb ADDED
@@ -0,0 +1,22 @@
1
module Eps
  module Utils
    # Classifies the values of column +c+ (named +k+ in error messages).
    #
    # Returns nil when every value is nil (including an empty column),
    # "numeric" when all values are Numeric, and "categorical" when all
    # values are Strings or all are booleans.
    #
    # Raises ArgumentError when the column is missing, when only some values
    # are nil, or when the value types are mixed.
    def self.column_type(c, k)
      raise ArgumentError, "Missing column: #{k}" if !c

      # also covers an empty column
      return nil if c.all? { |v| v.nil? }

      raise ArgumentError, "Missing values in column #{k}" if c.any? { |v| v.nil? }

      return "numeric" if c.all? { |v| v.is_a?(Numeric) }
      return "categorical" if c.all? { |v| v.is_a?(String) }

      # booleans are treated as categories
      return "categorical" if c.all? { |v| v == true || v == false }

      raise ArgumentError, "Column values must be all numeric, all string, or all boolean: #{k}"
    end
  end
end
data/lib/eps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eps
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,23 +1,37 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-05-19 00:00:00.000000000 Z
11
+ date: 2019-09-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
14
+ name: lightgbm
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.1.5
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.1.5
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
15
29
  requirement: !ruby/object:Gem::Requirement
16
30
  requirements:
17
31
  - - ">="
18
32
  - !ruby/object:Gem::Version
19
33
  version: '0'
20
- type: :development
34
+ type: :runtime
21
35
  prerelease: false
22
36
  version_requirements: !ruby/object:Gem::Requirement
23
37
  requirements:
@@ -25,7 +39,7 @@ dependencies:
25
39
  - !ruby/object:Gem::Version
26
40
  version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
- name: daru
42
+ name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
45
  - - ">="
@@ -39,7 +53,7 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: minitest
56
+ name: daru
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - ">="
@@ -53,7 +67,7 @@ dependencies:
53
67
  - !ruby/object:Gem::Version
54
68
  version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
- name: nokogiri
70
+ name: minitest
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - ">="
@@ -92,9 +106,21 @@ files:
92
106
  - lib/eps.rb
93
107
  - lib/eps/base.rb
94
108
  - lib/eps/base_estimator.rb
109
+ - lib/eps/data_frame.rb
110
+ - lib/eps/evaluators/lightgbm.rb
111
+ - lib/eps/evaluators/linear_regression.rb
112
+ - lib/eps/evaluators/naive_bayes.rb
113
+ - lib/eps/evaluators/node.rb
114
+ - lib/eps/label_encoder.rb
115
+ - lib/eps/lightgbm.rb
95
116
  - lib/eps/linear_regression.rb
117
+ - lib/eps/metrics.rb
96
118
  - lib/eps/model.rb
97
119
  - lib/eps/naive_bayes.rb
120
+ - lib/eps/pmml_generators/lightgbm.rb
121
+ - lib/eps/statistics.rb
122
+ - lib/eps/text_encoder.rb
123
+ - lib/eps/utils.rb
98
124
  - lib/eps/version.rb
99
125
  homepage: https://github.com/ankane/eps
100
126
  licenses: