eps 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,187 @@
1
module Eps
  module PmmlGenerators
    # Private helpers that render a LightGBM model as a PMML document.
    #
    # The methods read @objective, @target, @labels, @features,
    # @label_encoders and @trees, and call build_pmml,
    # pmml_local_transformations and display_field. None of those are defined
    # in this module - presumably the including class supplies them (confirm
    # against the includer).
    module LightGBM
      private

      # Builds the PMML for the model and returns whatever build_pmml returns.
      #
      # "regression" emits a single summed tree segmentation; "binary" and any
      # other objective (treated as multiclass) emit a model chain that ends in
      # a RegressionModel producing per-label probabilities.
      def generate_pmml
        build_pmml(pmml_data_fields) do |xml|
          function_name = @objective == "regression" ? "regression" : "classification"
          xml.MiningModel(functionName: function_name, algorithmName: "LightGBM") do
            xml.MiningSchema do
              xml.MiningField(name: @target, usageType: "target")
              # TODO add importance, but need to handle text features
              xml_feature_fields(xml)
            end
            pmml_local_transformations(xml)

            case @objective
            when "regression"
              xml_segmentation(xml, @trees)
            when "binary"
              xml_binary_segmentation(xml)
            else # multiclass
              xml_multiclass_segmentation(xml)
            end
          end
        end
      end

      # Maps every field name to its list of categories (nil when the field is
      # not categorical). The target comes first and is only included when
      # @labels is set.
      def pmml_data_fields
        data_fields = {}
        data_fields[@target] = @labels if @labels
        @features.each do |k, type|
          # TODO remove zero importance features
          data_fields[k] = type == "categorical" ? @label_encoders[k].labels.keys : nil
        end
        data_fields
      end

      # Emits one MiningField per feature, in @features order.
      def xml_feature_fields(xml)
        @features.keys.each do |k|
          xml.MiningField(name: k)
        end
      end

      # Emits a double Constant with +value+ as its text. The text has to be
      # added explicitly: the return value of a builder block is discarded.
      def xml_double_constant(xml, value)
        xml.Constant(dataType: "double") do
          xml.text value
        end
      end

      # Emits one probability OutputField per label.
      def xml_probability_outputs(xml)
        xml.Output do
          @labels.each do |label|
            xml.OutputField(name: "probability(#{label})", optype: "continuous", dataType: "double", feature: "probability", value: label)
          end
        end
      end

      # Binary objective: segment 1 sums the trees into lgbmValue and derives
      # transformedLgbmValue = 1 / (1 + exp(-lgbmValue)); segment 2 turns that
      # value into probabilities for the two labels.
      def xml_binary_segmentation(xml)
        xml.Segmentation(multipleModelMethod: "modelChain") do
          xml.Segment(id: 1) do
            xml.True
            xml.MiningModel(functionName: "regression") do
              xml.MiningSchema do
                xml_feature_fields(xml)
              end
              xml.Output do
                # raw tree sum; the field the expression below refers to
                xml.OutputField(name: "lgbmValue", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false)
                # the field segment 2 consumes
                xml.OutputField(name: "transformedLgbmValue", optype: "continuous", dataType: "double", feature: "transformedValue", isFinalResult: false) do
                  xml.Apply(function: "/") do
                    xml_double_constant(xml, "1.0")
                    xml.Apply(function: "+") do
                      xml_double_constant(xml, "1.0")
                      xml.Apply(function: "exp") do
                        xml.Apply(function: "*") do
                          xml_double_constant(xml, "-1.0")
                          xml.FieldRef(field: "lgbmValue")
                        end
                      end
                    end
                  end
                end
              end
              xml_segmentation(xml, @trees)
            end
          end
          xml.Segment(id: 2) do
            xml.True
            xml.RegressionModel(functionName: "classification", normalizationMethod: "none") do
              xml.MiningSchema do
                xml.MiningField(name: @target, usageType: "target")
                xml.MiningField(name: "transformedLgbmValue")
              end
              xml_probability_outputs(xml)
              xml.RegressionTable(intercept: 0.0, targetCategory: @labels.last) do
                xml.NumericPredictor(name: "transformedLgbmValue", coefficient: "1.0")
              end
              xml.RegressionTable(intercept: 0.0, targetCategory: @labels.first)
            end
          end
        end
      end

      # Multiclass objective: @trees is cut into @labels.size consecutive
      # slices, one regression segment per label, followed by a softmax
      # RegressionModel over the per-label values.
      def xml_multiclass_segmentation(xml)
        xml.Segmentation(multipleModelMethod: "modelChain") do
          trees_per_label = @trees.size / @labels.size
          @trees.each_slice(trees_per_label).each_with_index do |trees, idx|
            xml.Segment(id: idx + 1) do
              xml.True
              xml.MiningModel(functionName: "regression") do
                xml.MiningSchema do
                  xml_feature_fields(xml)
                end
                xml.Output do
                  xml.OutputField(name: "lgbmValue(#{@labels[idx]})", optype: "continuous", dataType: "double", feature: "predictedValue", isFinalResult: false)
                end
                xml_segmentation(xml, trees)
              end
            end
          end
          xml.Segment(id: @labels.size + 1) do
            xml.True
            xml.RegressionModel(functionName: "classification", normalizationMethod: "softmax") do
              xml.MiningSchema do
                xml.MiningField(name: @target, usageType: "target")
                @labels.each do |label|
                  xml.MiningField(name: "lgbmValue(#{label})")
                end
              end
              xml_probability_outputs(xml)
              @labels.each do |label|
                xml.RegressionTable(intercept: 0.0, targetCategory: label) do
                  xml.NumericPredictor(name: "lgbmValue(#{label})", coefficient: "1.0")
                end
              end
            end
          end
        end
      end

      # Emits a "sum" segmentation with one TreeModel segment per tree; segment
      # ids start at 1.
      def xml_segmentation(xml, trees)
        xml.Segmentation(multipleModelMethod: "sum") do
          trees.each_with_index do |node, i|
            xml.Segment(id: i + 1) do
              xml.True
              xml.TreeModel(functionName: "regression", missingValueStrategy: "none", noTrueChildStrategy: "returnLastPrediction", splitCharacteristic: "multiSplit") do
                xml.MiningSchema do
                  node_fields(node).uniq.each do |k|
                    xml.MiningField(name: display_field(k))
                  end
                end
                node_pmml(node, xml)
              end
            end
          end
        end
      end

      # Collects the fields tested by +node+ and all of its descendants, in
      # depth-first order. Duplicates are kept; callers de-duplicate.
      def node_fields(node)
        fields = node.predicate ? [node.field] : []
        node.children.each do |child|
          fields.concat(node_fields(child))
        end
        fields
      end

      # Recursively emits a Node element: its predicate (True when the node has
      # none, a set predicate for the "in" operator, a simple predicate
      # otherwise) followed by its child nodes.
      def node_pmml(node, xml)
        xml.Node(score: node.score) do
          if node.predicate.nil?
            xml.True
          elsif node.operator == "in"
            xml.SimpleSetPredicate(field: display_field(node.field), booleanOperator: "isIn") do
              xml.Array(type: "string") do
                xml.text node.value.map { |v| escape_element(v) }.join(" ")
              end
            end
          else
            xml.SimplePredicate(field: display_field(node.field), operator: node.operator, value: node.value)
          end
          node.children.each do |child|
            node_pmml(child, xml)
          end
        end
      end

      # Wraps +v+ in double quotes for use inside a string Array, escaping any
      # embedded double quote with a backslash.
      def escape_element(v)
        "\"#{v.gsub("\"", "\\\"")}\""
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
+ ### Extracted from https://github.com/estebanz01/ruby-statistics
2
+ ### The Ruby author is Esteban Zapata Rojas
3
+ ###
4
+ ### Originally extracted from https://codeplea.com/incomplete-beta-function-c
5
+ ### These functions shared under zlib license and the author is Lewis Van Winkle
6
+
7
module Eps
  # Numerical helpers for the Student's t distribution.
  module Statistics
    # Evaluates the regularized incomplete beta function at
    #   x = (value + sqrt(value^2 + df)) / (2 * sqrt(value^2 + df))
    # with both shape parameters equal to df / 2.
    #
    # @param value [Numeric] the t statistic
    # @param degrees_of_freedom [Numeric]
    # @return [Float, nil] nil when the continued fraction does not converge
    def self.tdist_p(value, degrees_of_freedom)
      root = Math.sqrt(value * value + degrees_of_freedom)
      x = (value + root) / (2.0 * root)
      shape = degrees_of_freedom / 2.0

      incomplete_beta_function(x, shape, shape)
    end

    # Regularized incomplete beta function, evaluated with a continued
    # fraction (modified Lentz iteration).
    #
    # @param x [Numeric] evaluation point; values below 0 yield 0.0 and values
    #   above 1 yield 1.0
    # @param alp [Numeric] first shape parameter
    # @param bet [Numeric] second shape parameter
    # @return [Float, nil] nil when the fraction has not converged after the
    #   iteration limit
    def self.incomplete_beta_function(x, alp, bet)
      # out-of-range inputs saturate on both sides
      return 0.0 if x < 0.0
      return 1.0 if x > 1.0

      tiny = 1.0E-50

      # Past this point the fraction converges slowly, so evaluate the
      # mirrored problem instead.
      if x > ((alp + 1.0) / (alp + bet + 2.0))
        mirrored = incomplete_beta_function(1.0 - x, bet, alp)
        # pass a non-converged result through instead of raising on 1.0 - nil
        return mirrored && 1.0 - mirrored
      end

      # To avoid overflow problems, the leading factor is computed in log
      # space: x^alp * (1 - x)^bet / (alp * B(alp, bet)).
      lbet_ab = Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]
      front = Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f

      f, c, d = 1.0, 1.0, 0.0

      returned_value = nil

      # Let's do more iterations than the proposed implementation (200 iters)
      (0..500).each do |number|
        m = number / 2 # integer division: index of the current coefficient pair

        numerator =
          if number == 0
            1.0
          elsif number.even?
            (m * (bet - m) * x) / ((alp + 2.0 * m - 1.0) * (alp + 2.0 * m))
          else
            top = -((alp + m) * (alp + bet + m) * x)
            down = (alp + 2.0 * m) * (alp + 2.0 * m + 1.0)

            top / down
          end

        # tiny keeps the reciprocal and the quotient away from a zero divisor
        d = 1.0 + numerator * d
        d = tiny if d.abs < tiny
        d = 1.0 / d

        c = 1.0 + numerator / c
        c = tiny if c.abs < tiny

        cd = c * d
        f *= cd

        if (1.0 - cd).abs < 1.0E-10
          returned_value = front * (f - 1.0)
          break
        end
      end

      returned_value
    end
  end
end
@@ -0,0 +1,81 @@
1
module Eps
  # Bag-of-words encoder: turns each input value into a token => count hash
  # and learns a vocabulary from the fitted data.
  #
  # Options read by this class: :vocabulary, :tokenizer, :stop_words,
  # :case_sensitive, :min_length, :min_occurrences, :max_occurrences and
  # :max_features.
  class TextEncoder
    attr_reader :options, :vocabulary

    def initialize(**options)
      @options = options
      @vocabulary = options[:vocabulary] || []
    end

    # Tokenizes +arr+, rebuilds the vocabulary from the total token counts
    # (after applying the length / occurrence / size limits) and returns the
    # per-value token counts. The returned counts are not filtered by the
    # vocabulary.
    def fit(arr)
      counts, encoded = count_and_fit(arr)

      min_length = options[:min_length]
      counts.select! { |token, _| token.length >= min_length } if min_length

      min_occurrences = options[:min_occurrences]
      counts.select! { |_, count| count >= min_occurrences } if min_occurrences

      max_occurrences = options[:max_occurrences]
      counts.reject! { |_, count| count > max_occurrences } if max_occurrences

      max_features = options[:max_features]
      if max_features
        # sort_by is not stable, so the position is part of the key: equally
        # frequent tokens keep their first-seen order and the cut-off is
        # deterministic
        ranked = counts.each_with_index.sort_by { |(_, count), position| [-count, position] }
        counts = ranked[0...max_features].map(&:first).to_h
      end

      @vocabulary = counts.keys

      encoded
    end

    # Returns the per-value token counts for +arr+ without touching the
    # vocabulary.
    def transform(arr)
      count_and_fit(arr).last
    end

    private

    # Returns [total token counts, array of per-value token counts].
    def count_and_fit(arr)
      tokenizer = options[:tokenizer]
      stop_words = Array(options[:stop_words])

      encoded =
        arr.map do |xi|
          # tokenize
          tokens = xi.to_s
          tokens = tokens.downcase unless options[:case_sensitive]
          tokens = tokens.split(tokenizer)

          # remove stop words
          tokens -= stop_words

          # count
          xc = Hash.new(0)
          tokens.each { |token| xc[token] += 1 }
          xc
        end

      counts = Hash.new(0)
      encoded.each do |xc|
        xc.each { |token, count| counts[token] += count }
      end

      # remove empty strings
      counts.delete("")

      [counts, encoded]
    end
  end
end
data/lib/eps/utils.rb ADDED
@@ -0,0 +1,22 @@
1
module Eps
  module Utils
    # Classifies the values of column +c+ (named +k+ in error messages).
    #
    # Returns nil when every value is nil (which includes an empty column),
    # "numeric" when all values are Numeric, and "categorical" when all values
    # are Strings or all are booleans.
    #
    # Raises ArgumentError when the column is missing, when only some values
    # are nil, or when the values mix types.
    def self.column_type(c, k)
      raise ArgumentError, "Missing column: #{k}" unless c

      # goes here for empty as well
      return nil if c.all?(&:nil?)

      raise ArgumentError, "Missing values in column #{k}" if c.any?(&:nil?)

      return "numeric" if c.all? { |v| v.is_a?(Numeric) }
      return "categorical" if c.all? { |v| v.is_a?(String) }
      return "categorical" if c.all? { |v| v == true || v == false } # boolean

      raise ArgumentError, "Column values must be all numeric, all string, or all boolean: #{k}"
    end
  end
end
data/lib/eps/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Eps
2
- VERSION = "0.2.1"
2
+ VERSION = "0.3.0"
3
3
  end
metadata CHANGED
@@ -1,23 +1,37 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: eps
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-05-19 00:00:00.000000000 Z
11
+ date: 2019-09-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: bundler
14
+ name: lightgbm
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 0.1.5
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 0.1.5
27
+ - !ruby/object:Gem::Dependency
28
+ name: nokogiri
15
29
  requirement: !ruby/object:Gem::Requirement
16
30
  requirements:
17
31
  - - ">="
18
32
  - !ruby/object:Gem::Version
19
33
  version: '0'
20
- type: :development
34
+ type: :runtime
21
35
  prerelease: false
22
36
  version_requirements: !ruby/object:Gem::Requirement
23
37
  requirements:
@@ -25,7 +39,7 @@ dependencies:
25
39
  - !ruby/object:Gem::Version
26
40
  version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
- name: daru
42
+ name: bundler
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
45
  - - ">="
@@ -39,7 +53,7 @@ dependencies:
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
- name: minitest
56
+ name: daru
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
59
  - - ">="
@@ -53,7 +67,7 @@ dependencies:
53
67
  - !ruby/object:Gem::Version
54
68
  version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
- name: nokogiri
70
+ name: minitest
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
73
  - - ">="
@@ -92,9 +106,21 @@ files:
92
106
  - lib/eps.rb
93
107
  - lib/eps/base.rb
94
108
  - lib/eps/base_estimator.rb
109
+ - lib/eps/data_frame.rb
110
+ - lib/eps/evaluators/lightgbm.rb
111
+ - lib/eps/evaluators/linear_regression.rb
112
+ - lib/eps/evaluators/naive_bayes.rb
113
+ - lib/eps/evaluators/node.rb
114
+ - lib/eps/label_encoder.rb
115
+ - lib/eps/lightgbm.rb
95
116
  - lib/eps/linear_regression.rb
117
+ - lib/eps/metrics.rb
96
118
  - lib/eps/model.rb
97
119
  - lib/eps/naive_bayes.rb
120
+ - lib/eps/pmml_generators/lightgbm.rb
121
+ - lib/eps/statistics.rb
122
+ - lib/eps/text_encoder.rb
123
+ - lib/eps/utils.rb
98
124
  - lib/eps/version.rb
99
125
  homepage: https://github.com/ankane/eps
100
126
  licenses: