rbbt-dm 1.1.46 → 1.1.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 5f70d6a55c5852ffdafd59a3199b41031ee5a5aaa4becc56ee2f7f49be3b5d43
- data.tar.gz: f5d53ca95b6af4d6c2b63f00bbbfd398e3fa5a6feea15885b00ccd946f3aa765
+ metadata.gz: 5327d1c2a46283b95fa380a73be418cf7e5a3afd2aca6002bd9ca591ab3f6df8
+ data.tar.gz: 1066345322e342c8f642b89825c1a8555c6bfa1d09985705d1b382654d91653f
  SHA512:
- metadata.gz: 7cd9928cd7a3e7558e27796f0e8f4c7635caa2331dc3f2d2d904434d9feedf6e3d2085ba301b55c61b87bdee83fbc588c666a99c0d0cc0d5414d1614a964a0a5
- data.tar.gz: f559bff1acc000be594ad8e91534a4af24acad3ebf80c276b8055af582a91e5137c748acf55267c8001b0c667cc1c6a167ca86a11b34da52ebb796ba8fdd01fe
+ metadata.gz: c1a3cf2ec93909993b290c7c6cb0b6e9c6090155657403c705b93b74a538cbe91ff23dead14c33453dde0c31ba681099b3e5c93f2699a471c19c299b43d0f304
+ data.tar.gz: e5b456330625bb57a494fb9e5fc9757e96c134da8f410fabe8f7e9d06169d09f0c4fa3c7e5a375870e6f45b8f5ffdf7855b8260719fb1a32846a2f24c18e8853
@@ -38,7 +38,7 @@ rbbt.GE.barcode.mode(#{ R.ruby2R self.data_file }, #{ R.ruby2R outfile }, #{ R.r
  end
  end
  key = key.first if Array === key
- [key, bars]
+ [key, bars]
  end
  end

@@ -32,7 +32,7 @@ module Paths
  if end_node
  end_node = end_node.select{|n| parents.keys.include? n}.first unless String === end_node
  return nil if not parents.include? end_node
- extract_path(parents, start_node, u)
+ extract_path(parents, start_node, end_node)
  else
  parents
  end
@@ -0,0 +1,43 @@
+ require 'rbbt/util/python'
+
+ module RbbtTensorflow
+
+ def self.init
+ RbbtPython.run do
+ pyimport "tensorflow", as: "tf"
+ end
+ end
+
+ def self.test
+
+ mod = x_test = y_test = nil
+ RbbtPython.run do
+
+ mnist_db = tf.keras.datasets.mnist
+
+ (x_train, y_train), (x_test, y_test) = mnist_db.load_data()
+ x_train, x_test = x_train / 255.0, x_test / 255.0
+
+ mod = tf.keras.models.Sequential.new([
+ tf.keras.layers.Flatten.new(input_shape: [28, 28]),
+ tf.keras.layers.Dense.new(128, activation:'relu'),
+ tf.keras.layers.Dropout.new(0.2),
+ tf.keras.layers.Dense.new(10, activation:'softmax')
+ ])
+ mod.compile(optimizer='adam',
+ loss='sparse_categorical_crossentropy',
+ metrics=['accuracy'])
+ mod.fit(x_train, y_train, epochs:1)
+ mod
+ end
+
+ RbbtPython.run do
+ mod.evaluate(x_test, y_test, verbose:2)
+ end
+ end
+ end
+
+ if __FILE__ == $0
+ RbbtTensorflow.init
+ RbbtTensorflow.test
+ end
@@ -4,6 +4,20 @@ class VectorModel
  attr_accessor :directory, :model_file, :extract_features, :train_model, :eval_model
  attr_accessor :features, :labels

+ def self.R_run(model_file, features, labels, code)
+ TmpFile.with_file do |feature_file|
+ Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
+ Open.write(feature_file + '.class', labels * "\n")
+
+ R.run <<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ labels = scan("#{ feature_file }.class");
+ features = cbind(features, class = labels);
+ #{code}
+ EOF
+ end
+ end
+
  def self.R_train(model_file, features, labels, code)
  TmpFile.with_file do |feature_file|
  Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
@@ -32,10 +46,10 @@ save(model, file='#{model_file}')
  features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
  load(file="#{model_file}");
  #{code}
- cat(paste(label, sep="\\n"));
+ cat(paste(label, sep="\\n", collapse="\\n"));
  EOF
-
- res = io.read.sub(/WARNING: .*?\n/s,'').split(/\s+/).collect{|l| l.to_f}
+ txt = io.read
+ res = txt.sub(/WARNING: .*?\n/s,'').split(/\s+/).collect{|l| l.to_f}

  if list
  res
@@ -46,13 +60,51 @@ cat(paste(label, sep="\\n"));
  end
  end

+ def __load_method(file)
+ code = Open.read(file)
+ code.sub!(/.*Proc\.new/, "Proc.new")
+ instance_eval code, file
+ end
+
  def initialize(directory, extract_features = nil, train_model = nil, eval_model = nil)
  @directory = directory
  FileUtils.mkdir_p @directory unless File.exists? @directory
+
  @model_file = File.join(@directory, "model")
- extract_features = @extract_features
- train_model = @train_model
- eval_model = @eval_model
+ @extract_features_file = File.join(@directory, "features")
+ @train_model_file = File.join(@directory, "train_model")
+ @eval_model_file = File.join(@directory, "eval_model")
+ @train_model_file_R = File.join(@directory, "train_model.R")
+ @eval_model_file_R = File.join(@directory, "eval_model.R")
+
+ if extract_features.nil?
+ if File.exists?(@extract_features_file)
+ @extract_features = __load_method @extract_features_file
+ end
+ else
+ @extract_features = extract_features
+ end
+
+ if train_model.nil?
+ if File.exists?(@train_model_file)
+ @train_model = __load_method @train_model_file
+ elsif File.exists?(@train_model_file_R)
+ @train_model = Open.read(@train_model_file_R)
+ end
+ else
+ @train_model = train_model
+ end
+
+ if eval_model.nil?
+ if File.exists?(@eval_model_file)
+ @eval_model = __load_method @eval_model_file
+ elsif File.exists?(@eval_model_file_R)
+ @eval_model = Open.read(@eval_model_file_R)
+ end
+ else
+ @eval_model = eval_model
+ end
+
  @features = []
  @labels = []
  end
@@ -63,8 +115,47 @@ cat(paste(label, sep="\\n"));
  end

  def add(element, label = nil)
- @features << extract_features.call(element)
- @labels << label unless label.nil?
+ features = @extract_features ? extract_features.call(element) : element
+ @features << features
+ @labels << label
+ end
+
+ def add_list(elements, labels = nil)
+ if @extract_features.nil? || @extract_features.arity == 1
+ elements.zip(labels || [nil]).each do |elem,label|
+ add(elem, label)
+ end
+ else
+ features = @extract_features.call(nil, elements)
+ @features.concat features
+ @labels.concat labels if labels
+ end
+ end
+
+ def save_models
+ require 'method_source'
+
+ case
+ when Proc === train_model
+ begin
+ Open.write(@train_model_file, train_model.source)
+ rescue
+ end
+ when String === train_model
+ Open.write(@train_model_file_R, @train_model)
+ end
+
+ Open.write(@extract_features_file, @extract_features.source) if @extract_features
+
+ case
+ when Proc === eval_model
+ begin
+ Open.write(@eval_model_file, eval_model.source)
+ rescue
+ end
+ when String === eval_model
+ Open.write(@eval_model_file_R, eval_model)
+ end
  end

  def train
@@ -72,105 +163,131 @@ cat(paste(label, sep="\\n"));
  when Proc === train_model
  train_model.call(@model_file, @features, @labels)
  when String === train_model
- SVMModel.R_train(@model_file, @features, @labels, train_model)
+ VectorModel.R_train(@model_file, @features, @labels, train_model)
  end
+ save_models
+ end
+
+ def run(code)
+ VectorModel.R_run(@model_file, @features, @labels, code)
  end

  def eval(element)
  case
- when Proc === eval_model
- eval_model.call(@model_file, extract_features.call(element), false)
- when String === eval_model
- SVMModel.R_eval(@model_file, extract_features.call(element), false, eval_model)
+ when Proc === @eval_model
+ @eval_model.call(@model_file, @extract_features.call(element), false)
+ when String === @eval_model
+ VectorModel.R_eval(@model_file, @extract_features.call(element), false, eval_model)
  end
  end

  def eval_list(elements, extract = true)
+
+ if extract && ! @extract_features.nil?
+ features = if @extract_features.arity == 1
+ elements.collect{|element| @extract_features.call(element) }
+ else
+ @extract_features.call(nil, elements)
+ end
+ else
+ features = elements
+ end
+
  case
  when Proc === eval_model
- eval_model.call(@model_file, extract ? elements.collect{|element| extract_features.call(element)} : elements, true)
+ eval_model.call(@model_file, features, true)
  when String === eval_model
- SVMModel.R_eval(@model_file, extract ? elements.collect{|element| extract_features.call(element)} : elements, true, eval_model)
+ VectorModel.R_eval(@model_file, features, true, eval_model)
  end
  end

- def cross_validation(folds = 10)
- saved_features = @features
- saved_labels = @labels
- seq = (0..features.length - 1).to_a
+ #def cross_validation(folds = 10)
+ # saved_features = @features
+ # saved_labels = @labels
+ # seq = (0..features.length - 1).to_a

- chunk_size = features.length / folds
+ # chunk_size = features.length / folds

- acc = []
- folds.times do
- seq = seq.shuffle
- eval_chunk = seq[0..chunk_size]
- train_chunk = seq[chunk_size.. -1]
+ # acc = []
+ # folds.times do
+ # seq = seq.shuffle
+ # eval_chunk = seq[0..chunk_size]
+ # train_chunk = seq[chunk_size.. -1]

- eval_features = @features.values_at *eval_chunk
- eval_labels = @labels.values_at *eval_chunk
+ # eval_features = @features.values_at *eval_chunk
+ # eval_labels = @labels.values_at *eval_chunk

- @features = @features.values_at *train_chunk
- @labels = @labels.values_at *train_chunk
+ # @features = @features.values_at *train_chunk
+ # @labels = @labels.values_at *train_chunk

- train
- predictions = eval_list eval_features, false
+ # train
+ # predictions = eval_list eval_features, false

- acc << predictions.zip(eval_labels).collect{|pred,lab| pred - lab < 0.5 ? 1 : 0}.inject(0){|acc,e| acc +=e} / chunk_size
+ # acc << predictions.zip(eval_labels).collect{|pred,lab| pred - lab < 0.5 ? 1 : 0}.inject(0){|acc,e| acc +=e} / chunk_size

- @features = saved_features
- @labels = saved_labels
- end
+ # @features = saved_features
+ # @labels = saved_labels
+ # end

- acc
- end
+ # acc
+ #end

  def cross_validation(folds = 10)

  res = TSV.setup({}, "Fold~TP,TN,FP,FN,P,R,F1#:type=:list")

- feature_folds = Misc.divide(@features, folds)
- labels_folds = Misc.divide(@labels, folds)
+ orig_features = @features
+ orig_labels = @labels

- folds.times do |fix|
+ begin
+ feature_folds = Misc.divide(@features, folds)
+ labels_folds = Misc.divide(@labels, folds)

- test_set = feature_folds[fix]
- train_set = feature_folds.values_at(*((0..9).to_a - [fix])).inject([]){|acc,e| acc += e; acc}
+ folds.times do |fix|

- test_labels = labels_folds[fix]
- train_labels = labels_folds.values_at(*((0..9).to_a - [fix])).flatten
+ rest = (0..(folds-1)).to_a - [fix]

- tp, fp, tn, fn, pr, re, f1 = [0, 0, 0, 0, nil, nil, nil]
+ test_set = feature_folds[fix]
+ train_set = feature_folds.values_at(*rest).inject([]){|acc,e| acc += e; acc}

- @features = train_set
- @labels = train_labels
- self.train
- predictions = self.eval_list test_set, false
+ test_labels = labels_folds[fix]
+ train_labels = labels_folds.values_at(*rest).flatten

- test_labels.zip(predictions).each do |gs,pred|
- gs = gs.to_i
- pred = pred > 0.5 ? 1 : 0
- tp += 1 if gs == pred && gs == 1
- tn += 1 if gs == pred && gs == 0
- fp += 1 if gs == 0 && pred == 1
- fn += 1 if gs == 1 && pred == 0
- end
+ tp, fp, tn, fn, pr, re, f1 = [0, 0, 0, 0, nil, nil, nil]

- p = tp + fn
- pp = tp + fp
+ @features = train_set
+ @labels = train_labels
+ self.train
+ predictions = self.eval_list test_set, false

- pr = tp.to_f / pp
- re = tp.to_f / p
+ raise "Number of predictions (#{predictions.length}) and test labels (#{test_labels.length}) do not match" if predictions.length != test_labels.length

- f1 = (2.0 * tp) / (2.0 * tp + fp + fn)
+ test_labels.zip(predictions).each do |gs,pred|
+ gs = gs.to_i
+ pred = pred > 0.5 ? 1 : 0
+ tp += 1 if gs == pred && gs == 1
+ tn += 1 if gs == pred && gs == 0
+ fp += 1 if gs == 0 && pred == 1
+ fn += 1 if gs == 1 && pred == 0
+ end

- Misc.fingerprint([tp,tn,fp,fn,pr,re,f1])
+ p = tp + fn
+ pp = tp + fp

- Log.debug "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1}"
+ pr = tp.to_f / pp
+ re = tp.to_f / p

- res[fix] = [tp,tn,fp,fn,pr,re,f1]
- end
+ f1 = (2.0 * tp) / (2.0 * tp + fp + fn)

+ Log.debug "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
+
+ res[fix] = [tp,tn,fp,fn,pr,re,f1]
+ end
+ ensure
+ @features = orig_features
+ @labels = orig_labels
+ end
+ self.train
  res
  end
  end
@@ -0,0 +1,73 @@
+ require 'rbbt/vector/model'
+ require 'rbbt/nlp/spaCy'
+
+ class SpaCyModel < VectorModel
+ attr_accessor :config
+
+ def spacy(&block)
+ RbbtPython.run "spacy" do
+ RbbtPython.module_eval(&block)
+ end
+ end
+
+ def initialize(dir, config, lang = 'en_core_web_md')
+ @config = case
+ when Path === config
+ config.read
+ when Misc.is_filename?(config)
+ Open.read(config)
+ when (Misc.is_filename?(config, false) && Rbbt.share.spaCy.cpu[config].exists?)
+ Rbbt.share.spaCy.cpu[config].read
+ when (Misc.is_filename?(config, false) && Rbbt.share.spaCy[config].exists?)
+ Rbbt.share.spaCy[config].read
+ else
+ config
+ end
+ @lang = lang
+
+ super(dir)
+
+ @train_model = Proc.new do |file, features, labels|
+ texts = features
+ docs = []
+ tmpconfig = File.join(file, 'config')
+ tmptrain = File.join(file, 'train.spacy')
+ SpaCy.config(@config, tmpconfig)
+ spacy do
+ nlp = SpaCy.nlp(lang)
+ docs = []
+ RbbtPython.iterate nlp.pipe(texts.zip(labels), as_tuples: true), :bar => "Training documents into spacy format" do |doc,label|
+ if %w(1 true pos).include?(label.to_s.downcase)
+ doc.cats["positive"] = 1
+ doc.cats["negative"] = 0
+ else
+ doc.cats["positive"] = 0
+ doc.cats["negative"] = 1
+ end
+ docs << doc
+ end
+
+ doc_bin = spacy.tokens.DocBin.new(docs: docs)
+ doc_bin.to_disk(tmptrain)
+ end
+
+ gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train)
+ CMD.cmd_log(:spacy, "train #{tmpconfig} --output #{file} --paths.train #{tmptrain} --paths.dev #{tmptrain}", "--gpu-id" => gpu)
+ end
+
+ @eval_model = Proc.new do |file, features|
+ texts = features
+
+ docs = []
+ spacy do
+ nlp = spacy.load("#{file}/model-best")
+
+ texts.collect do |text|
+ cats = nlp.(text).cats
+ cats['positive'] > cats['negative'] ? 1 : 0
+ end
+ end
+ end
+ end
+
+ end
@@ -9,7 +9,7 @@ class SVMModel < VectorModel

  @train_model =<<-EOF
  library(e1071);
- model = svm(class ~ ., data = features, scale=c(0));
+ model = svm(as.factor(class) ~ ., data = features);
  EOF

  @eval_model =<<-EOF
@@ -0,0 +1,55 @@
+ require 'rbbt/vector/model'
+ require 'rbbt/tensorflow'
+
+ class TensorFlowModel < VectorModel
+ attr_accessor :graph, :epochs, :compile_options
+
+ def tensorflow(&block)
+ RbbtPython.run "tensorflow" do
+ RbbtPython.module_eval(&block)
+ end
+ end
+
+ def keras(&block)
+ RbbtPython.run "tensorflow.keras", as: 'keras' do
+ RbbtPython.run "tensorflow" do
+ RbbtPython.module_eval(&block)
+ end
+ end
+ end
+
+ def initialize(dir, graph = nil, epochs = 3, **compile_options)
+ @graph = graph
+ @epochs = epochs
+ @compile_options = compile_options
+
+ super(dir)
+
+ @train_model = Proc.new do |file, features, labels|
+ tensorflow do
+ features = tensorflow.convert_to_tensor(features)
+ labels = tensorflow.convert_to_tensor(labels)
+ end
+ @graph ||= keras_graph
+ @graph.compile(**@compile_options)
+ @graph.fit(features, labels, :epochs => @epochs, :verbose => false)
+ @graph.save(file)
+ end
+
+ @eval_model = Proc.new do |file, features|
+ tensorflow do
+ features = tensorflow.convert_to_tensor(features)
+ end
+ keras do
+ @graph ||= keras.models.load_model(file)
+ indices = @graph.predict(features, :verbose => false).tolist()
+ labels = indices.collect{|p| p.length > 1 ? p.index(p.max): p.first }
+ labels
+ end
+ end
+ end
+
+ def keras_graph(&block)
+ @graph = keras(&block)
+ end
+ end
@@ -11,7 +11,7 @@ class TestBarcode < Test::Unit::TestCase
  data["G4"] = [6,6,1,1,1,1]

  TmpFile.with_file(data.to_s) do |file|
- m = Matrix.new file
+ m = RbbtMatrix.new file
  m.barcode(file+'.barcode')
  tsv = TSV.open(file+'.barcode')
  assert tsv["G2"] = [0,1,0,1,0,1]
@@ -20,7 +20,7 @@ N4 N5
  end_node = "N5"

  path = Paths.dijkstra(network, start_node, [end_node])
- assert_equal %w(N1 N2 N4), path.reverse
+ assert_equal %w(N1 N2 N4 N5), path.reverse
  end

  def test_weighted_dijsktra
@@ -0,0 +1,121 @@
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
+ require 'rbbt/vector/model/spaCy'
+ require 'rbbt/vector/model/svm'
+
+ class TestSpaCyModel < Test::Unit::TestCase
+
+ def test_spyCy
+ TmpFile.with_file() do |dir|
+ Log.severity = 0
+ FileUtils.mkdir_p dir
+
+ model = SpaCyModel.new(
+ dir,
+ "cpu/textcat_efficiency.conf"
+ )
+
+
+ require 'rbbt/tsv/csv'
+ url = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"
+ tsv = TSV.csv(Open.open(url))
+ tsv = tsv.reorder("Review Text", ["Recommended IND"]).to_single
+
+ good = tsv.select("Recommended IND" => '1')
+ bad = tsv.select("Recommended IND" => '0')
+
+ gsize = 2000
+ bsize = 500
+ good.keys[0..gsize-1].each do |text|
+ next if text.nil? || text.empty?
+ model.add text, '1'
+ end
+
+ bad.keys[0..bsize-1].each do |text|
+ model.add text, '0'
+ end
+
+ model.cross_validation
+ end
+
+ def test_svm_spacy
+
+ require 'rbbt/tsv/csv'
+ url = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"
+ tsv = TSV.csv(Open.open(url))
+ tsv = tsv.reorder("Review Text", ["Recommended IND"]).to_single
+
+ good = tsv.select("Recommended IND" => '1')
+ bad = tsv.select("Recommended IND" => '0')
+
+ gsize = 2000
+ bsize = 500
+ model = SVMModel.new(
+ dir
+ )
+
+ nlp = RbbtPython.run "spacy" do
+ spacy.load('en_core_web_md')
+ end
+
+ model.extract_features = Proc.new do |text|
+ vs = RbbtPython.run do
+ RbbtPython.collect nlp.(text).__iter__ do |token|
+ token.vector.tolist()
+ end
+ end
+ length = vs.length
+
+ v = vs.inject(nil){|acc,ev| acc = acc.nil? ? ev : acc.zip(ev).collect{|a,b| a + b } }
+
+ v.collect{|e| e / length }
+ end
+
+ TSV.traverse good.keys[0..gsize-1], :type => :array, :bar => true do |text|
+ next if text.nil? || text.empty?
+ model.add text, '1'
+ end
+
+ TSV.traverse bad.keys[0..bsize-1], :type => :array, :bar => true do |text|
+ model.add text, '0'
+ end
+
+ model.cross_validation
+
+ end
+ end
+
+ def test_spyCy_trf
+ TmpFile.with_file() do |dir|
+ Log.severity = 0
+ FileUtils.mkdir_p dir
+
+ model = SpaCyModel.new(
+ dir,
+ "gpu/textcat_accuracy.conf"
+ )
+
+
+ require 'rbbt/tsv/csv'
+ url = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"
+ tsv = TSV.csv(Open.open(url))
+ tsv = tsv.reorder("Review Text", ["Recommended IND"]).to_single
+
+ good = tsv.select("Recommended IND" => '1')
+ bad = tsv.select("Recommended IND" => '0')
+
+ gsize = 2000
+ bsize = 500
+ good.keys[0..gsize-1].each do |text|
+ next if text.nil? || text.empty?
+ model.add text, '1'
+ end
+
+ bad.keys[0..bsize-1].each do |text|
+ model.add text, '0'
+ end
+
+ model.cross_validation
+ end
+ end
+ end
+
@@ -0,0 +1,57 @@
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
+ require 'rbbt/vector/model/tensorflow'
+
+ class TestTensorflowModel < Test::Unit::TestCase
+
+ def test_keras
+ TmpFile.with_file() do |dir|
+ FileUtils.mkdir_p dir
+
+ model = TensorFlowModel.new(
+ dir,
+ optimizer:'adam',
+ loss: 'sparse_categorical_crossentropy',
+ metrics: ['accuracy']
+ )
+
+ model.keras_graph do
+ tf = tensorflow
+ tf.keras.models.Sequential.new([
+ tf.keras.layers.Flatten.new(input_shape: [28, 28]),
+ tf.keras.layers.Dense.new(128, activation:'relu'),
+ tf.keras.layers.Dropout.new(0.2),
+ tf.keras.layers.Dense.new(10, activation:'softmax')
+ ])
+ end
+
+ sum = predictions = nil
+ model.tensorflow do
+ tf = tensorflow
+ mnist_db = tf.keras.datasets.mnist
+
+ (x_train, y_train), (x_test, y_test) = mnist_db.load_data()
+ x_train, x_test = x_train / 255.0, x_test / 255.0
+
+ num = PyCall.len(x_train)
+
+ num.times do |i|
+ model.add x_train[i], y_train[i]
+ end
+
+ model.train
+
+ predictions = model.eval_list x_test.tolist()
+ sum = 0
+ predictions.zip(y_test.tolist()).each do |pred,label|
+ sum += 1 if label.to_i == pred
+ end
+
+ end
+
+ assert sum.to_f / predictions.length > 0.7
+
+
+ end
+ end
+ end
+
@@ -71,4 +71,358 @@ cat(label, file="#{results}");
  end
  end

+ def test_model_list
+ text =<<-EOF
+ 1 0;1;1
+ 1 1;0;1
+ 1 1;1;1
+ 1 0;1;1
+ 1 1;1;1
+ 0 0;1;0
+ 0 1;0;0
+ 0 0;1;0
+ 0 1;0;0
+ EOF
+
+ TmpFile.with_file() do |dir|
+ FileUtils.mkdir_p dir
+ model = VectorModel.new(dir)
+
+ model.extract_features = Proc.new{|element,list|
+ if element
+ element.split(";")
+ elsif list
+ list.collect{|e| e.split(";") }
+ end
+ }
+
+ model.train_model = Proc.new{|model_file,features,labels|
+ TmpFile.with_file do |feature_file|
+ Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
+ Open.write(feature_file + '.class', labels * "\n")
+ R.run <<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ labels = scan("#{ feature_file }.class", what=numeric());
+ features = cbind(features, class = labels);
+ rbbt.require('e1071')
+ model = svm(class ~ ., data = features)
+ save(model, file="#{ model_file }");
+ EOF
+ end
+ }
+
+ model.eval_model = Proc.new{|model_file,features|
+ TmpFile.with_file do |feature_file|
+ TmpFile.with_file do |results|
+ Open.write(feature_file, features * "\t")
+ puts R.run(<<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ library(e1071)
+ load(file="#{ model_file }")
+ label = predict(model, features);
+ cat(label, file="#{results}");
+ EOF
+ ).read
+ Open.read(results)
+ end
+ end
+
+ }
+
+ pairs = text.split(/\n/).collect do |line|
+ label, features = line.split(" ")
+ [features, label]
+ end
+
+ model.add_list(*Misc.zip_fields(pairs))
+
+ model.train
+
+ assert model.eval("1;1;1").to_f > 0.5
+ assert model.eval("0;0;0").to_f < 0.5
+ end
+ end
+
+ def test_model_list2
+ text =<<-EOF
+ 1 0;1;1
+ 1 1;0;1
+ 1 1;1;1
+ 1 0;1;1
+ 1 1;1;1
+ 0 0;1;0
+ 0 1;0;0
+ 0 0;1;0
+ 0 1;0;0
+ EOF
+
+ TmpFile.with_file() do |dir|
+ FileUtils.mkdir_p dir
+ model = VectorModel.new(dir)
+
+ model.extract_features = Proc.new{|element|
+ element.split(";")
+ }
+
+ model.train_model = Proc.new{|model_file,features,labels|
+ TmpFile.with_file do |feature_file|
+ Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
+ Open.write(feature_file + '.class', labels * "\n")
+ R.run <<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ labels = scan("#{ feature_file }.class", what=numeric());
+ features = cbind(features, class = labels);
+ rbbt.require('e1071')
+ model = svm(class ~ ., data = features)
+ save(model, file="#{ model_file }");
+ EOF
+ end
+ }
+
+ model.eval_model = Proc.new{|model_file,features|
+ TmpFile.with_file do |feature_file|
+ TmpFile.with_file do |results|
+ Open.write(feature_file, features * "\t")
+ puts R.run(<<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ library(e1071)
+ load(file="#{ model_file }")
+ label = predict(model, features);
+ cat(label, file="#{results}");
+ EOF
+ ).read
+ Open.read(results)
+ end
+ end
+
+ }
+
+ pairs = text.split(/\n/).collect do |line|
+ label, features = line.split(" ")
+ [features, label]
+ end
+
+ model.add_list(*Misc.zip_fields(pairs))
+
+ model.train
+
+ assert model.eval("1;1;1").to_f > 0.5
+ assert model.eval("0;0;0").to_f < 0.5
+ end
+ end
+
+ def test_model_list
+ text =<<-EOF
+ 1 0;1;1
+ 1 1;0;1
+ 1 1;1;1
+ 1 0;1;1
+ 1 1;1;1
+ 0 0;1;0
+ 0 1;0;0
+ 0 0;1;0
+ 0 1;0;0
+ EOF
+
+ TmpFile.with_file() do |dir|
+ FileUtils.mkdir_p dir
+ model = VectorModel.new(dir)
+
+ model.extract_features = Proc.new{|element,list|
+ if element
+ element.split(";")
+ elsif list
+ list.collect{|e| e.split(";") }
+ end
+ }
+
+ model.train_model = Proc.new{|model_file,features,labels|
+ TmpFile.with_file do |feature_file|
+ Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
+ Open.write(feature_file + '.class', labels * "\n")
+ R.run <<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ labels = scan("#{ feature_file }.class", what=numeric());
+ features = cbind(features, class = labels);
+ rbbt.require('e1071')
+ model = svm(class ~ ., data = features)
+ save(model, file="#{ model_file }");
+ EOF
+ end
+ }
+
+ model.eval_model = Proc.new{|model_file,features|
+ TmpFile.with_file do |feature_file|
+ TmpFile.with_file do |results|
+ Open.write(feature_file, features * "\t")
+ puts R.run(<<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ library(e1071)
+ load(file="#{ model_file }")
+ label = predict(model, features);
+ cat(label, file="#{results}");
+ EOF
+ ).read
+ Open.read(results)
+ end
+ end
+
+ }
+
+ pairs = text.split(/\n/).collect do |line|
+ label, features = line.split(" ")
+ model.add features, label
+ end
+
+ model.train
+
+ assert model.eval("1;1;1").to_f > 0.5
+ assert model.eval("0;0;0").to_f < 0.5
+ end
+ end
+
+ def test_model_save
+ text =<<-EOF
+ 1 0;1;1
+ 1 1;0;1
+ 1 1;1;1
+ 1 0;1;1
+ 1 1;1;1
+ 0 0;1;0
+ 0 1;0;0
+ 0 0;1;0
+ 0 1;0;0
+ EOF
+
+ TmpFile.with_file() do |dir|
+ FileUtils.mkdir_p dir
+ model = VectorModel.new(dir)
+
+ model.extract_features = Proc.new{|element|
+ element.split(";")
+ }
+
+ model.train_model = Proc.new{|model_file,features,labels|
+ TmpFile.with_file do |feature_file|
+ Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
+ Open.write(feature_file + '.class', labels * "\n")
+ R.run <<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ labels = scan("#{ feature_file }.class", what=numeric());
+ features = cbind(features, class = labels);
+ rbbt.require('e1071')
+ model = svm(class ~ ., data = features)
+ save(model, file="#{ model_file }");
+ EOF
+ end
+ }
+
+ model.eval_model = Proc.new{|model_file,features|
+ TmpFile.with_file do |feature_file|
+ TmpFile.with_file do |results|
+ Open.write(feature_file, features * "\t")
+ puts R.run(<<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ library(e1071)
+ load(file="#{ model_file }")
+ label = predict(model, features);
+ cat(label, file="#{results}");
+ EOF
+ ).read
+ Open.read(results)
+ end
+ end
+
+ }
+
+ pairs = text.split(/\n/).collect do |line|
+ label, features = line.split(" ")
+ [features, label]
+ end
+
+ model.add_list(*Misc.zip_fields(pairs))
+
+ model.train
+
+ assert model.eval("1;1;1").to_f > 0.5
+ assert model.eval("0;0;0").to_f < 0.5
+ end
+ end
+
+ def test_model_save
+ text =<<-EOF
+ 1 0;1;1
+ 1 1;0;1
+ 1 1;1;1
+ 1 0;1;1
+ 1 1;1;1
+ 0 0;1;0
+ 0 1;0;0
+ 0 0;1;0
+ 0 1;0;0
+ EOF
+
+ TmpFile.with_file() do |dir|
+ FileUtils.mkdir_p dir
+ model = VectorModel.new(dir)
+
+ model.extract_features = Proc.new{|element,list|
+ if element
+ element.split(";")
+ elsif list
+ list.collect{|e| e.split(";") }
+ end
+ }
+
+ model.train_model = Proc.new{|model_file,features,labels|
+ TmpFile.with_file do |feature_file|
+ Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
+ Open.write(feature_file + '.class', labels * "\n")
+ R.run <<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ labels = scan("#{ feature_file }.class", what=numeric());
+ features = cbind(features, class = labels);
+ rbbt.require('e1071')
+ model = svm(class ~ ., data = features)
+ save(model, file="#{ model_file }");
+ EOF
+ end
+ }
+
+ model.eval_model = Proc.new{|model_file,features|
+ TmpFile.with_file do |feature_file|
+ TmpFile.with_file do |results|
+ Open.write(feature_file, features * "\t")
+ puts R.run(<<-EOF
+ features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
+ library(e1071)
+ load(file="#{ model_file }")
+ label = predict(model, features);
+ cat(label, file="#{results}");
+ EOF
+ ).read
+ Open.read(results)
+ end
+ end
+
+ }
+
+ pairs = text.split(/\n/).collect do |line|
+ label, features = line.split(" ")
+ model.add features, label
+ end
+
+ model.train
+
+ model = VectorModel.new(dir)
+ pairs = text.split(/\n/).collect do |line|
+ label, features = line.split(" ")
+ model.add features, label
+ end
+
+ assert model.eval("1;1;1").to_f > 0.5
+ assert model.eval("0;0;0").to_f < 0.5
+ end
+ end
+
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: rbbt-dm
  version: !ruby/object:Gem::Version
- version: 1.1.46
+ version: 1.1.51
  platform: ruby
  authors:
  - Miguel Vazquez
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2020-01-30 00:00:00.000000000 Z
+ date: 2021-06-23 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rbbt-util
@@ -39,7 +39,7 @@ dependencies:
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
- name: priority_queue_cxx
+ name: priority_queue_cxx17
  requirement: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
@@ -105,8 +105,11 @@ files:
  - lib/rbbt/statistics/hypergeometric.rb
  - lib/rbbt/statistics/random_walk.rb
  - lib/rbbt/statistics/rank_product.rb
+ - lib/rbbt/tensorflow.rb
  - lib/rbbt/vector/model.rb
+ - lib/rbbt/vector/model/spaCy.rb
  - lib/rbbt/vector/model/svm.rb
+ - lib/rbbt/vector/model/tensorflow.rb
  - share/R/MA.R
  - share/R/barcode.R
  - share/R/heatmap.3.R
@@ -118,7 +121,9 @@ files:
  - test/rbbt/statistics/test_random_walk.rb
  - test/rbbt/test_ml_task.rb
  - test/rbbt/test_stan.rb
+ - test/rbbt/vector/model/test_spaCy.rb
  - test/rbbt/vector/model/test_svm.rb
+ - test/rbbt/vector/model/test_tensorflow.rb
  - test/rbbt/vector/test_model.rb
  - test/test_helper.rb
  homepage: http://github.com/mikisvaz/rbbt-phgx
@@ -139,7 +144,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.0.6
+ rubygems_version: 3.1.4
  signing_key:
  specification_version: 4
  summary: Data-mining and statistics
@@ -152,6 +157,8 @@ test_files:
  - test/rbbt/statistics/test_hypergeometric.rb
  - test/rbbt/test_ml_task.rb
  - test/rbbt/vector/test_model.rb
+ - test/rbbt/vector/model/test_spaCy.rb
+ - test/rbbt/vector/model/test_tensorflow.rb
  - test/rbbt/vector/model/test_svm.rb
  - test/rbbt/test_stan.rb
  - test/test_helper.rb