rbbt-dm 1.1.46 → 1.1.51

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5f70d6a55c5852ffdafd59a3199b41031ee5a5aaa4becc56ee2f7f49be3b5d43
4
- data.tar.gz: f5d53ca95b6af4d6c2b63f00bbbfd398e3fa5a6feea15885b00ccd946f3aa765
3
+ metadata.gz: 5327d1c2a46283b95fa380a73be418cf7e5a3afd2aca6002bd9ca591ab3f6df8
4
+ data.tar.gz: 1066345322e342c8f642b89825c1a8555c6bfa1d09985705d1b382654d91653f
5
5
  SHA512:
6
- metadata.gz: 7cd9928cd7a3e7558e27796f0e8f4c7635caa2331dc3f2d2d904434d9feedf6e3d2085ba301b55c61b87bdee83fbc588c666a99c0d0cc0d5414d1614a964a0a5
7
- data.tar.gz: f559bff1acc000be594ad8e91534a4af24acad3ebf80c276b8055af582a91e5137c748acf55267c8001b0c667cc1c6a167ca86a11b34da52ebb796ba8fdd01fe
6
+ metadata.gz: c1a3cf2ec93909993b290c7c6cb0b6e9c6090155657403c705b93b74a538cbe91ff23dead14c33453dde0c31ba681099b3e5c93f2699a471c19c299b43d0f304
7
+ data.tar.gz: e5b456330625bb57a494fb9e5fc9757e96c134da8f410fabe8f7e9d06169d09f0c4fa3c7e5a375870e6f45b8f5ffdf7855b8260719fb1a32846a2f24c18e8853
@@ -38,7 +38,7 @@ rbbt.GE.barcode.mode(#{ R.ruby2R self.data_file }, #{ R.ruby2R outfile }, #{ R.r
38
38
  end
39
39
  end
40
40
  key = key.first if Array === key
41
- [key, bars]
41
+ [key, bars]
42
42
  end
43
43
  end
44
44
 
@@ -32,7 +32,7 @@ module Paths
32
32
  if end_node
33
33
  end_node = end_node.select{|n| parents.keys.include? n}.first unless String === end_node
34
34
  return nil if not parents.include? end_node
35
- extract_path(parents, start_node, u)
35
+ extract_path(parents, start_node, end_node)
36
36
  else
37
37
  parents
38
38
  end
@@ -0,0 +1,43 @@
1
+ require 'rbbt/util/python'
2
+
3
# Small TensorFlow smoke-test helpers driven through RbbtPython.
module RbbtTensorflow

  # Imports the Python "tensorflow" package under the alias `tf`.
  def self.init
    RbbtPython.run do
      pyimport "tensorflow", as: "tf"
    end
  end

  # Builds a small dense classifier for the Keras MNIST dataset, fits it for
  # one epoch and then calls `evaluate` on the test split.
  def self.test
    # Declared before the blocks so the values assigned inside the first
    # block are still visible in the second one.
    mod = x_test = y_test = nil

    RbbtPython.run do
      mnist_db = tf.keras.datasets.mnist

      (x_train, y_train), (x_test, y_test) = mnist_db.load_data()
      x_train, x_test = x_train / 255.0, x_test / 255.0

      mod = tf.keras.models.Sequential.new([
        tf.keras.layers.Flatten.new(input_shape: [28, 28]),
        tf.keras.layers.Dense.new(128, activation: 'relu'),
        tf.keras.layers.Dropout.new(0.2),
        tf.keras.layers.Dense.new(10, activation: 'softmax')
      ])

      # Real keyword arguments. The previous `optimizer='adam'` form is Python
      # syntax; in Ruby it assigns throw-away locals and passes the values
      # positionally.
      mod.compile(optimizer: 'adam',
                  loss: 'sparse_categorical_crossentropy',
                  metrics: ['accuracy'])
      mod.fit(x_train, y_train, epochs: 1)
      mod
    end

    RbbtPython.run do
      mod.evaluate(x_test, y_test, verbose: 2)
    end
  end
end

# Run the smoke test when this file is executed directly.
if __FILE__ == $0
  RbbtTensorflow.init
  RbbtTensorflow.test
end
@@ -4,6 +4,20 @@ class VectorModel
4
4
  attr_accessor :directory, :model_file, :extract_features, :train_model, :eval_model
5
5
  attr_accessor :features, :labels
6
6
 
7
# Runs the R snippet +code+ with the features and labels loaded into R.
#
# The features are written one row per line (tab-separated) to a temporary
# file and the labels to "<that file>.class"; the R script reads both into a
# `features` data frame with an extra `class` column and then runs +code+.
# +model_file+ is accepted but not used here.
def self.R_run(model_file, features, labels, code)
  TmpFile.with_file do |feature_file|
    label_file = feature_file + '.class'
    feature_rows = features.collect { |feats| feats * "\t" }

    Open.write(feature_file, feature_rows * "\n")
    Open.write(label_file, labels * "\n")

    script = <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
labels = scan("#{ feature_file }.class");
features = cbind(features, class = labels);
#{code}
    EOF

    R.run script
  end
end
20
+
7
21
  def self.R_train(model_file, features, labels, code)
8
22
  TmpFile.with_file do |feature_file|
9
23
  Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
@@ -32,10 +46,10 @@ save(model, file='#{model_file}')
32
46
  features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
33
47
  load(file="#{model_file}");
34
48
  #{code}
35
- cat(paste(label, sep="\\n"));
49
+ cat(paste(label, sep="\\n", collapse="\\n"));
36
50
  EOF
37
-
38
- res = io.read.sub(/WARNING: .*?\n/s,'').split(/\s+/).collect{|l| l.to_f}
51
+ txt = io.read
52
+ res = txt.sub(/WARNING: .*?\n/s,'').split(/\s+/).collect{|l| l.to_f}
39
53
 
40
54
  if list
41
55
  res
@@ -46,13 +60,51 @@ cat(paste(label, sep="\\n"));
46
60
  end
47
61
  end
48
62
 
63
# Rebuilds a Proc from the source text stored in +file+.
#
# Everything that precedes the first "Proc.new" on its line (for instance an
# assignment target) is dropped, and the remaining text is evaluated in the
# context of this instance.
#
# NOTE(review): this evaluates the file contents as Ruby code, so the model
# directory must only contain trusted files.
def __load_method(file)
  source = Open.read(file).sub(/.*Proc\.new/, "Proc.new")
  instance_eval source, file
end
68
+
49
69
  def initialize(directory, extract_features = nil, train_model = nil, eval_model = nil)
50
70
  @directory = directory
51
71
  FileUtils.mkdir_p @directory unless File.exists? @directory
72
+
52
73
  @model_file = File.join(@directory, "model")
53
- extract_features = @extract_features
54
- train_model = @train_model
55
- eval_model = @eval_model
74
+ @extract_features_file = File.join(@directory, "features")
75
+ @train_model_file = File.join(@directory, "train_model")
76
+ @eval_model_file = File.join(@directory, "eval_model")
77
+ @train_model_file_R = File.join(@directory, "train_model.R")
78
+ @eval_model_file_R = File.join(@directory, "eval_model.R")
79
+
80
+ if extract_features.nil?
81
+ if File.exists?(@extract_features_file)
82
+ @extract_features = __load_method @extract_features_file
83
+ end
84
+ else
85
+ @extract_features = extract_features
86
+ end
87
+
88
+ if train_model.nil?
89
+ if File.exists?(@train_model_file)
90
+ @train_model = __load_method @train_model_file
91
+ elsif File.exists?(@train_model_file_R)
92
+ @train_model = Open.read(@train_model_file_R)
93
+ end
94
+ else
95
+ @train_model = train_model
96
+ end
97
+
98
+ if eval_model.nil?
99
+ if File.exists?(@eval_model_file)
100
+ @eval_model = __load_method @eval_model_file
101
+ elsif File.exists?(@eval_model_file_R)
102
+ @eval_model = Open.read(@eval_model_file_R)
103
+ end
104
+ else
105
+ @eval_model = eval_model
106
+ end
107
+
56
108
  @features = []
57
109
  @labels = []
58
110
  end
@@ -63,8 +115,47 @@ cat(paste(label, sep="\\n"));
63
115
  end
64
116
 
65
117
  def add(element, label = nil)
66
- @features << extract_features.call(element)
67
- @labels << label unless label.nil?
118
+ features = @extract_features ? extract_features.call(element) : element
119
+ @features << features
120
+ @labels << label
121
+ end
122
+
123
# Appends several training examples at once.
#
# With no extractor, or an extractor of arity 1, every element goes through
# #add together with its label (nil when +labels+ is missing or shorter).
# Otherwise the extractor is called once as `call(nil, elements)` and is
# expected to return one feature vector per element.
def add_list(elements, labels = nil)
  if @extract_features.nil? || @extract_features.arity == 1
    elements.zip(labels || [nil]).each do |elem, label|
      add(elem, label)
    end
  else
    features = @extract_features.call(nil, elements)
    @features.concat features
    # Pad with nil labels when none are given so @labels stays aligned with
    # @features, as it does when examples are added one by one.
    @labels.concat(labels || Array.new(features.length))
  end
end
134
+
135
# Persists the extractor, training and evaluation steps into the model
# directory so they can be restored later.
#
# Procs are stored as Ruby source (recovered with the method_source gem),
# strings are stored verbatim in the ".R" files. Saving Proc source is best
# effort: a failure is logged and does not abort the caller.
def save_models
  # Loaded lazily: only needed to recover Proc source text.
  require 'method_source'

  case train_model
  when Proc
    __save_proc_source(@train_model_file, train_model)
  when String
    Open.write(@train_model_file_R, train_model)
  end

  __save_proc_source(@extract_features_file, @extract_features) if @extract_features

  case eval_model
  when Proc
    __save_proc_source(@eval_model_file, eval_model)
  when String
    Open.write(@eval_model_file_R, eval_model)
  end
end

# Writes the source of +block+ into +file+, logging instead of raising when
# the source cannot be recovered or written.
def __save_proc_source(file, block)
  Open.write(file, block.source)
rescue StandardError => e
  Log.debug "Could not save source into #{file}: #{e.message}"
end
69
160
 
70
161
  def train
@@ -72,105 +163,131 @@ cat(paste(label, sep="\\n"));
72
163
  when Proc === train_model
73
164
  train_model.call(@model_file, @features, @labels)
74
165
  when String === train_model
75
- SVMModel.R_train(@model_file, @features, @labels, train_model)
166
+ VectorModel.R_train(@model_file, @features, @labels, train_model)
76
167
  end
168
+ save_models
169
+ end
170
+
171
# Runs the R snippet +code+ over this model's current features and labels
# by delegating to VectorModel.R_run.
def run(code)
  model_file, features, labels = @model_file, @features, @labels
  VectorModel.R_run(model_file, features, labels, code)
end
78
174
 
79
175
  def eval(element)
80
176
  case
81
- when Proc === eval_model
82
- eval_model.call(@model_file, extract_features.call(element), false)
83
- when String === eval_model
84
- SVMModel.R_eval(@model_file, extract_features.call(element), false, eval_model)
177
+ when Proc === @eval_model
178
+ @eval_model.call(@model_file, @extract_features.call(element), false)
179
+ when String === @eval_model
180
+ VectorModel.R_eval(@model_file, @extract_features.call(element), false, eval_model)
85
181
  end
86
182
  end
87
183
 
88
184
  def eval_list(elements, extract = true)
185
+
186
+ if extract && ! @extract_features.nil?
187
+ features = if @extract_features.arity == 1
188
+ elements.collect{|element| @extract_features.call(element) }
189
+ else
190
+ @extract_features.call(nil, elements)
191
+ end
192
+ else
193
+ features = elements
194
+ end
195
+
89
196
  case
90
197
  when Proc === eval_model
91
- eval_model.call(@model_file, extract ? elements.collect{|element| extract_features.call(element)} : elements, true)
198
+ eval_model.call(@model_file, features, true)
92
199
  when String === eval_model
93
- SVMModel.R_eval(@model_file, extract ? elements.collect{|element| extract_features.call(element)} : elements, true, eval_model)
200
+ VectorModel.R_eval(@model_file, features, true, eval_model)
94
201
  end
95
202
  end
96
203
 
97
- def cross_validation(folds = 10)
98
- saved_features = @features
99
- saved_labels = @labels
100
- seq = (0..features.length - 1).to_a
204
+ #def cross_validation(folds = 10)
205
+ # saved_features = @features
206
+ # saved_labels = @labels
207
+ # seq = (0..features.length - 1).to_a
101
208
 
102
- chunk_size = features.length / folds
209
+ # chunk_size = features.length / folds
103
210
 
104
- acc = []
105
- folds.times do
106
- seq = seq.shuffle
107
- eval_chunk = seq[0..chunk_size]
108
- train_chunk = seq[chunk_size.. -1]
211
+ # acc = []
212
+ # folds.times do
213
+ # seq = seq.shuffle
214
+ # eval_chunk = seq[0..chunk_size]
215
+ # train_chunk = seq[chunk_size.. -1]
109
216
 
110
- eval_features = @features.values_at *eval_chunk
111
- eval_labels = @labels.values_at *eval_chunk
217
+ # eval_features = @features.values_at *eval_chunk
218
+ # eval_labels = @labels.values_at *eval_chunk
112
219
 
113
- @features = @features.values_at *train_chunk
114
- @labels = @labels.values_at *train_chunk
220
+ # @features = @features.values_at *train_chunk
221
+ # @labels = @labels.values_at *train_chunk
115
222
 
116
- train
117
- predictions = eval_list eval_features, false
223
+ # train
224
+ # predictions = eval_list eval_features, false
118
225
 
119
- acc << predictions.zip(eval_labels).collect{|pred,lab| pred - lab < 0.5 ? 1 : 0}.inject(0){|acc,e| acc +=e} / chunk_size
226
+ # acc << predictions.zip(eval_labels).collect{|pred,lab| pred - lab < 0.5 ? 1 : 0}.inject(0){|acc,e| acc +=e} / chunk_size
120
227
 
121
- @features = saved_features
122
- @labels = saved_labels
123
- end
228
+ # @features = saved_features
229
+ # @labels = saved_labels
230
+ # end
124
231
 
125
- acc
126
- end
232
+ # acc
233
+ #end
127
234
 
128
235
  def cross_validation(folds = 10)
129
236
 
130
237
  res = TSV.setup({}, "Fold~TP,TN,FP,FN,P,R,F1#:type=:list")
131
238
 
132
- feature_folds = Misc.divide(@features, folds)
133
- labels_folds = Misc.divide(@labels, folds)
239
+ orig_features = @features
240
+ orig_labels = @labels
134
241
 
135
- folds.times do |fix|
242
+ begin
243
+ feature_folds = Misc.divide(@features, folds)
244
+ labels_folds = Misc.divide(@labels, folds)
136
245
 
137
- test_set = feature_folds[fix]
138
- train_set = feature_folds.values_at(*((0..9).to_a - [fix])).inject([]){|acc,e| acc += e; acc}
246
+ folds.times do |fix|
139
247
 
140
- test_labels = labels_folds[fix]
141
- train_labels = labels_folds.values_at(*((0..9).to_a - [fix])).flatten
248
+ rest = (0..(folds-1)).to_a - [fix]
142
249
 
143
- tp, fp, tn, fn, pr, re, f1 = [0, 0, 0, 0, nil, nil, nil]
250
+ test_set = feature_folds[fix]
251
+ train_set = feature_folds.values_at(*rest).inject([]){|acc,e| acc += e; acc}
144
252
 
145
- @features = train_set
146
- @labels = train_labels
147
- self.train
148
- predictions = self.eval_list test_set, false
253
+ test_labels = labels_folds[fix]
254
+ train_labels = labels_folds.values_at(*rest).flatten
149
255
 
150
- test_labels.zip(predictions).each do |gs,pred|
151
- gs = gs.to_i
152
- pred = pred > 0.5 ? 1 : 0
153
- tp += 1 if gs == pred && gs == 1
154
- tn += 1 if gs == pred && gs == 0
155
- fp += 1 if gs == 0 && pred == 1
156
- fn += 1 if gs == 1 && pred == 0
157
- end
256
+ tp, fp, tn, fn, pr, re, f1 = [0, 0, 0, 0, nil, nil, nil]
158
257
 
159
- p = tp + fn
160
- pp = tp + fp
258
+ @features = train_set
259
+ @labels = train_labels
260
+ self.train
261
+ predictions = self.eval_list test_set, false
161
262
 
162
- pr = tp.to_f / pp
163
- re = tp.to_f / p
263
+ raise "Number of predictions (#{predictions.length}) and test labels (#{test_labels.length}) do not match" if predictions.length != test_labels.length
164
264
 
165
- f1 = (2.0 * tp) / (2.0 * tp + fp + fn)
265
+ test_labels.zip(predictions).each do |gs,pred|
266
+ gs = gs.to_i
267
+ pred = pred > 0.5 ? 1 : 0
268
+ tp += 1 if gs == pred && gs == 1
269
+ tn += 1 if gs == pred && gs == 0
270
+ fp += 1 if gs == 0 && pred == 1
271
+ fn += 1 if gs == 1 && pred == 0
272
+ end
166
273
 
167
- Misc.fingerprint([tp,tn,fp,fn,pr,re,f1])
274
+ p = tp + fn
275
+ pp = tp + fp
168
276
 
169
- Log.debug "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1}"
277
+ pr = tp.to_f / pp
278
+ re = tp.to_f / p
170
279
 
171
- res[fix] = [tp,tn,fp,fn,pr,re,f1]
172
- end
280
+ f1 = (2.0 * tp) / (2.0 * tp + fp + fn)
173
281
 
282
+ Log.debug "CV Fold #{fix} P:#{"%.3f" % pr} R:#{"%.3f" % re} F1:#{"%.3f" % f1} - #{[tp.to_s, tn.to_s, fp.to_s, fn.to_s] * " "}"
283
+
284
+ res[fix] = [tp,tn,fp,fn,pr,re,f1]
285
+ end
286
+ ensure
287
+ @features = orig_features
288
+ @labels = orig_labels
289
+ end
290
+ self.train
174
291
  res
175
292
  end
176
293
  end
@@ -0,0 +1,73 @@
1
+ require 'rbbt/vector/model'
2
+ require 'rbbt/nlp/spaCy'
3
+
4
# VectorModel whose training and evaluation steps use a spaCy text
# categorizer with the two categories "positive" and "negative".
class SpaCyModel < VectorModel
  attr_accessor :config

  # Evaluates the block inside RbbtPython with the "spacy" module loaded.
  def spacy(&block)
    RbbtPython.run "spacy" do
      RbbtPython.module_eval(&block)
    end
  end

  # +config+ may be a Path, a file name, the name of a configuration found
  # under Rbbt.share.spaCy.cpu or Rbbt.share.spaCy, or the configuration
  # text itself; the resolved text is kept in @config.
  def initialize(dir, config, lang = 'en_core_web_md')
    @config =
      if Path === config
        config.read
      elsif Misc.is_filename?(config)
        Open.read(config)
      elsif Misc.is_filename?(config, false) && Rbbt.share.spaCy.cpu[config].exists?
        Rbbt.share.spaCy.cpu[config].read
      elsif Misc.is_filename?(config, false) && Rbbt.share.spaCy[config].exists?
        Rbbt.share.spaCy[config].read
      else
        config
      end
    @lang = lang

    super(dir)

    # Kept as Proc.new (not a lambda): the base class calls these with its
    # own argument lists, which only a non-lambda proc tolerates.
    @train_model = Proc.new do |file, features, labels|
      texts = features
      docs = []
      tmpconfig = File.join(file, 'config')
      tmptrain = File.join(file, 'train.spacy')
      SpaCy.config(@config, tmpconfig)

      spacy do
        nlp = SpaCy.nlp(lang)
        docs = []
        pipe = nlp.pipe(texts.zip(labels), as_tuples: true)
        RbbtPython.iterate(pipe, :bar => "Training documents into spacy format") do |doc, label|
          positive = %w(1 true pos).include?(label.to_s.downcase)
          doc.cats["positive"] = positive ? 1 : 0
          doc.cats["negative"] = positive ? 0 : 1
          docs << doc
        end

        doc_bin = spacy.tokens.DocBin.new(docs: docs)
        doc_bin.to_disk(tmptrain)
      end

      gpu = Rbbt::Config.get('gpu_id', :spacy, :spacy_train)
      # The training file is also passed as the dev set
      CMD.cmd_log(:spacy, "train #{tmpconfig} --output #{file} --paths.train #{tmptrain} --paths.dev #{tmptrain}", "--gpu-id" => gpu)
    end

    @eval_model = Proc.new do |file, features|
      texts = features

      spacy do
        nlp = spacy.load("#{file}/model-best")

        # 1 when "positive" scores higher than "negative", 0 otherwise
        texts.collect do |text|
          cats = nlp.(text).cats
          cats['positive'] > cats['negative'] ? 1 : 0
        end
      end
    end
  end

end
@@ -9,7 +9,7 @@ class SVMModel < VectorModel
9
9
 
10
10
  @train_model =<<-EOF
11
11
  library(e1071);
12
- model = svm(class ~ ., data = features, scale=c(0));
12
+ model = svm(as.factor(class) ~ ., data = features);
13
13
  EOF
14
14
 
15
15
  @eval_model =<<-EOF
@@ -0,0 +1,55 @@
1
+ require 'rbbt/vector/model'
2
+ require 'rbbt/tensorflow'
3
+
4
# VectorModel whose training and evaluation steps use a Keras graph.
class TensorFlowModel < VectorModel
  attr_accessor :graph, :epochs, :compile_options

  # Evaluates the block inside RbbtPython with "tensorflow" loaded.
  def tensorflow(&block)
    RbbtPython.run "tensorflow" do
      RbbtPython.module_eval(&block)
    end
  end

  # Evaluates the block inside RbbtPython with "tensorflow.keras" (as
  # `keras`) and "tensorflow" loaded.
  def keras(&block)
    RbbtPython.run "tensorflow.keras", as: 'keras' do
      RbbtPython.run "tensorflow" do
        RbbtPython.module_eval(&block)
      end
    end
  end

  # +graph+ is the Keras model to train (it can also be set later through
  # #keras_graph), +epochs+ the number of training epochs, and any extra
  # keyword arguments are handed to the graph's `compile`.
  def initialize(dir, graph = nil, epochs = 3, **compile_options)
    @graph = graph
    @epochs = epochs
    @compile_options = compile_options

    super(dir)

    # Kept as Proc.new (not a lambda): the base class calls these with its
    # own argument lists, which only a non-lambda proc tolerates.
    @train_model = Proc.new do |file, features, labels|
      tensorflow do
        features = tensorflow.convert_to_tensor(features)
        labels = tensorflow.convert_to_tensor(labels)
      end
      @graph ||= keras_graph
      @graph.compile(**@compile_options)
      @graph.fit(features, labels, :epochs => @epochs, :verbose => false)
      @graph.save(file)
    end

    @eval_model = Proc.new do |file, features|
      tensorflow do
        features = tensorflow.convert_to_tensor(features)
      end

      # Blocks given to #keras are module_eval'ed on RbbtPython, so an
      # @graph written inside one would belong to RbbtPython instead of this
      # model. Carry the graph through a local variable and store it back on
      # the model afterwards.
      graph = @graph
      labels = keras do
        graph ||= keras.models.load_model(file)
        scores = graph.predict(features, :verbose => false).tolist()
        # Several outputs: index of the highest one. Single output: its value.
        scores.collect { |row| row.length > 1 ? row.index(row.max) : row.first }
      end
      @graph = graph
      labels
    end
  end

  # Builds the graph by evaluating the block through #keras and stores it.
  def keras_graph(&block)
    @graph = keras(&block)
  end
end
@@ -11,7 +11,7 @@ class TestBarcode < Test::Unit::TestCase
11
11
  data["G4"] = [6,6,1,1,1,1]
12
12
 
13
13
  TmpFile.with_file(data.to_s) do |file|
14
- m = Matrix.new file
14
+ m = RbbtMatrix.new file
15
15
  m.barcode(file+'.barcode')
16
16
  tsv = TSV.open(file+'.barcode')
17
17
  assert tsv["G2"] = [0,1,0,1,0,1]
@@ -20,7 +20,7 @@ N4 N5
20
20
  end_node = "N5"
21
21
 
22
22
  path = Paths.dijkstra(network, start_node, [end_node])
23
- assert_equal %w(N1 N2 N4), path.reverse
23
+ assert_equal %w(N1 N2 N4 N5), path.reverse
24
24
  end
25
25
 
26
26
  def test_weighted_dijsktra
@@ -0,0 +1,121 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/vector/model/spaCy'
3
+ require 'rbbt/vector/model/svm'
4
+
5
class TestSpaCyModel < Test::Unit::TestCase

  REVIEWS_URL = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"
  GOOD_SIZE = 2000
  BAD_SIZE = 500

  # Downloads the review data set and returns [good_texts, bad_texts]: the
  # first GOOD_SIZE recommended and BAD_SIZE non-recommended review texts,
  # with nil/empty texts dropped.
  def review_texts
    require 'rbbt/tsv/csv'
    tsv = TSV.csv(Open.open(REVIEWS_URL))
    tsv = tsv.reorder("Review Text", ["Recommended IND"]).to_single

    good = tsv.select("Recommended IND" => '1')
    bad = tsv.select("Recommended IND" => '0')

    [good.keys[0..GOOD_SIZE-1], bad.keys[0..BAD_SIZE-1]].collect do |texts|
      texts.reject { |text| text.nil? || text.empty? }
    end
  end

  # Trains and cross-validates a SpaCyModel using the given configuration.
  def cross_validate_spacy(config)
    TmpFile.with_file() do |dir|
      Log.severity = 0
      FileUtils.mkdir_p dir

      model = SpaCyModel.new(dir, config)

      good, bad = review_texts
      good.each { |text| model.add text, '1' }
      bad.each { |text| model.add text, '0' }

      model.cross_validation
    end
  end

  def test_spyCy
    cross_validate_spacy "cpu/textcat_efficiency.conf"
  end

  # This test used to be defined inside the body of test_spyCy and referred
  # to a `dir` that was not in scope; it is now a test of its own with its
  # own temporary directory.
  def test_svm_spacy
    TmpFile.with_file() do |dir|
      FileUtils.mkdir_p dir

      good, bad = review_texts

      model = SVMModel.new(
        dir
      )

      nlp = RbbtPython.run "spacy" do
        spacy.load('en_core_web_md')
      end

      # Feature vector of a text: the average of its token vectors
      model.extract_features = Proc.new do |text|
        vs = RbbtPython.run do
          RbbtPython.collect nlp.(text).__iter__ do |token|
            token.vector.tolist()
          end
        end
        length = vs.length

        v = vs.inject(nil){|acc,ev| acc = acc.nil? ? ev : acc.zip(ev).collect{|a,b| a + b } }

        v.collect{|e| e / length }
      end

      TSV.traverse good, :type => :array, :bar => true do |text|
        model.add text, '1'
      end

      TSV.traverse bad, :type => :array, :bar => true do |text|
        model.add text, '0'
      end

      model.cross_validation
    end
  end

  def test_spyCy_trf
    cross_validate_spacy "gpu/textcat_accuracy.conf"
  end
end
121
+
@@ -0,0 +1,57 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/vector/model/tensorflow'
3
+
4
class TestTensorflowModel < Test::Unit::TestCase

  # Trains a small dense network on MNIST through TensorFlowModel and checks
  # that more than 70% of the test digits are predicted correctly.
  def test_keras
    TmpFile.with_file() do |dir|
      FileUtils.mkdir_p dir

      compile_options = {
        optimizer: 'adam',
        loss: 'sparse_categorical_crossentropy',
        metrics: ['accuracy']
      }
      model = TensorFlowModel.new(dir, **compile_options)

      model.keras_graph do
        tf = tensorflow
        layers = [
          tf.keras.layers.Flatten.new(input_shape: [28, 28]),
          tf.keras.layers.Dense.new(128, activation:'relu'),
          tf.keras.layers.Dropout.new(0.2),
          tf.keras.layers.Dense.new(10, activation:'softmax')
        ]
        tf.keras.models.Sequential.new(layers)
      end

      # Declared here so the values computed inside the block are visible to
      # the assertion below.
      sum = predictions = nil
      model.tensorflow do
        tf = tensorflow
        mnist_db = tf.keras.datasets.mnist

        (x_train, y_train), (x_test, y_test) = mnist_db.load_data()
        x_train, x_test = x_train / 255.0, x_test / 255.0

        PyCall.len(x_train).times do |i|
          model.add x_train[i], y_train[i]
        end

        model.train

        predictions = model.eval_list x_test.tolist()
        sum = 0
        predictions.zip(y_test.tolist()).each do |pred, label|
          sum += 1 if label.to_i == pred
        end
      end

      accuracy = sum.to_f / predictions.length
      assert accuracy > 0.7
    end
  end
end
57
+
@@ -71,4 +71,358 @@ cat(label, file="#{results}");
71
71
  end
72
72
  end
73
73
 
74
# add_list with an extractor that takes (element, list): the whole list of
# elements is handed to the extractor in one call.
def test_model_list
  text =<<-EOF
1 0;1;1
1 1;0;1
1 1;1;1
1 0;1;1
1 1;1;1
0 0;1;0
0 1;0;0
0 0;1;0
0 1;0;0
  EOF

  TmpFile.with_file() do |dir|
    FileUtils.mkdir_p dir
    model = VectorModel.new(dir)

    # The procs are written with Proc.new and only use their own arguments.
    model.extract_features = Proc.new do |element,list|
      if element
        element.split(";")
      elsif list
        list.map { |item| item.split(";") }
      end
    end

    model.train_model = Proc.new do |model_file,features,labels|
      TmpFile.with_file do |feature_file|
        rows = features.map { |feats| feats.join("\t") }
        Open.write(feature_file, rows.join("\n"))
        Open.write(feature_file + '.class', labels.join("\n"))
        R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
labels = scan("#{ feature_file }.class", what=numeric());
features = cbind(features, class = labels);
rbbt.require('e1071')
model = svm(class ~ ., data = features)
save(model, file="#{ model_file }");
        EOF
      end
    end

    model.eval_model = Proc.new do |model_file,features|
      TmpFile.with_file do |feature_file|
        TmpFile.with_file do |results|
          Open.write(feature_file, features.join("\t"))
          puts R.run(<<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
library(e1071)
load(file="#{ model_file }")
label = predict(model, features);
cat(label, file="#{results}");
          EOF
          ).read
          Open.read(results)
        end
      end
    end

    # Each line is "<label> <features>"; collect [features, label] pairs
    pairs = text.split(/\n/).map do |line|
      label, features = line.split(" ")
      [features, label]
    end

    model.add_list(*Misc.zip_fields(pairs))

    model.train

    assert model.eval("1;1;1").to_f > 0.5
    assert model.eval("0;0;0").to_f < 0.5
  end
end
145
+
146
# add_list with an extractor that takes a single element: every element is
# extracted on its own.
def test_model_list2
  text =<<-EOF
1 0;1;1
1 1;0;1
1 1;1;1
1 0;1;1
1 1;1;1
0 0;1;0
0 1;0;0
0 0;1;0
0 1;0;0
  EOF

  TmpFile.with_file() do |dir|
    FileUtils.mkdir_p dir
    model = VectorModel.new(dir)

    # The procs are written with Proc.new and only use their own arguments.
    model.extract_features = Proc.new do |element|
      element.split(";")
    end

    model.train_model = Proc.new do |model_file,features,labels|
      TmpFile.with_file do |feature_file|
        rows = features.map { |feats| feats.join("\t") }
        Open.write(feature_file, rows.join("\n"))
        Open.write(feature_file + '.class', labels.join("\n"))
        R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
labels = scan("#{ feature_file }.class", what=numeric());
features = cbind(features, class = labels);
rbbt.require('e1071')
model = svm(class ~ ., data = features)
save(model, file="#{ model_file }");
        EOF
      end
    end

    model.eval_model = Proc.new do |model_file,features|
      TmpFile.with_file do |feature_file|
        TmpFile.with_file do |results|
          Open.write(feature_file, features.join("\t"))
          puts R.run(<<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
library(e1071)
load(file="#{ model_file }")
label = predict(model, features);
cat(label, file="#{results}");
          EOF
          ).read
          Open.read(results)
        end
      end
    end

    # Each line is "<label> <features>"; collect [features, label] pairs
    pairs = text.split(/\n/).map do |line|
      label, features = line.split(" ")
      [features, label]
    end

    model.add_list(*Misc.zip_fields(pairs))

    model.train

    assert model.eval("1;1;1").to_f > 0.5
    assert model.eval("0;0;0").to_f < 0.5
  end
end
213
+
214
# Examples are added one at a time with #add, using an extractor that takes
# (element, list).
#
# Renamed from test_model_list: a method with that name is already defined
# earlier in this class, and a second `def` with the same name replaces the
# first, so only one of the two tests would ever run.
def test_model_add
  text =<<-EOF
1 0;1;1
1 1;0;1
1 1;1;1
1 0;1;1
1 1;1;1
0 0;1;0
0 1;0;0
0 0;1;0
0 1;0;0
  EOF

  TmpFile.with_file() do |dir|
    FileUtils.mkdir_p dir
    model = VectorModel.new(dir)

    model.extract_features = Proc.new{|element,list|
      if element
        element.split(";")
      elsif list
        list.collect{|e| e.split(";") }
      end
    }

    model.train_model = Proc.new{|model_file,features,labels|
      TmpFile.with_file do |feature_file|
        Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
        Open.write(feature_file + '.class', labels * "\n")
        R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
labels = scan("#{ feature_file }.class", what=numeric());
features = cbind(features, class = labels);
rbbt.require('e1071')
model = svm(class ~ ., data = features)
save(model, file="#{ model_file }");
        EOF
      end
    }

    model.eval_model = Proc.new{|model_file,features|
      TmpFile.with_file do |feature_file|
        TmpFile.with_file do |results|
          Open.write(feature_file, features * "\t")
          puts R.run(<<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
library(e1071)
load(file="#{ model_file }")
label = predict(model, features);
cat(label, file="#{results}");
          EOF
          ).read
          Open.read(results)
        end
      end
    }

    # Each line is "<label> <features>"
    text.split(/\n/).each do |line|
      label, features = line.split(" ")
      model.add features, label
    end

    model.train

    assert model.eval("1;1;1").to_f > 0.5
    assert model.eval("0;0;0").to_f < 0.5
  end
end
283
+
284
# add_list with a single-element extractor, followed by train and eval on
# the same model instance.
#
# Renamed from test_model_save: another test_model_save is defined later in
# this class and would replace this one, and this test never reloads a
# saved model, so the old name did not describe it.
def test_model_add_list
  text =<<-EOF
1 0;1;1
1 1;0;1
1 1;1;1
1 0;1;1
1 1;1;1
0 0;1;0
0 1;0;0
0 0;1;0
0 1;0;0
  EOF

  TmpFile.with_file() do |dir|
    FileUtils.mkdir_p dir
    model = VectorModel.new(dir)

    model.extract_features = Proc.new{|element|
      element.split(";")
    }

    model.train_model = Proc.new{|model_file,features,labels|
      TmpFile.with_file do |feature_file|
        Open.write(feature_file, features.collect{|feats| feats * "\t"} * "\n")
        Open.write(feature_file + '.class', labels * "\n")
        R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
labels = scan("#{ feature_file }.class", what=numeric());
features = cbind(features, class = labels);
rbbt.require('e1071')
model = svm(class ~ ., data = features)
save(model, file="#{ model_file }");
        EOF
      end
    }

    model.eval_model = Proc.new{|model_file,features|
      TmpFile.with_file do |feature_file|
        TmpFile.with_file do |results|
          Open.write(feature_file, features * "\t")
          puts R.run(<<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
library(e1071)
load(file="#{ model_file }")
label = predict(model, features);
cat(label, file="#{results}");
          EOF
          ).read
          Open.read(results)
        end
      end
    }

    # Each line is "<label> <features>"; collect [features, label] pairs
    pairs = text.split(/\n/).collect do |line|
      label, features = line.split(" ")
      [features, label]
    end

    model.add_list(*Misc.zip_fields(pairs))

    model.train

    assert model.eval("1;1;1").to_f > 0.5
    assert model.eval("0;0;0").to_f < 0.5
  end
end
351
+
352
# Trains a model, then builds a second VectorModel over the same directory
# without passing any procs and evaluates with that second instance.
def test_model_save
  text =<<-EOF
1 0;1;1
1 1;0;1
1 1;1;1
1 0;1;1
1 1;1;1
0 0;1;0
0 1;0;0
0 0;1;0
0 1;0;0
  EOF

  TmpFile.with_file() do |dir|
    FileUtils.mkdir_p dir
    model = VectorModel.new(dir)

    # The procs are written with Proc.new and only use their own arguments.
    model.extract_features = Proc.new do |element,list|
      if element
        element.split(";")
      elsif list
        list.map { |item| item.split(";") }
      end
    end

    model.train_model = Proc.new do |model_file,features,labels|
      TmpFile.with_file do |feature_file|
        rows = features.map { |feats| feats.join("\t") }
        Open.write(feature_file, rows.join("\n"))
        Open.write(feature_file + '.class', labels.join("\n"))
        R.run <<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
labels = scan("#{ feature_file }.class", what=numeric());
features = cbind(features, class = labels);
rbbt.require('e1071')
model = svm(class ~ ., data = features)
save(model, file="#{ model_file }");
        EOF
      end
    end

    model.eval_model = Proc.new do |model_file,features|
      TmpFile.with_file do |feature_file|
        TmpFile.with_file do |results|
          Open.write(feature_file, features.join("\t"))
          puts R.run(<<-EOF
features = read.table("#{ feature_file }", sep ="\\t", stringsAsFactors=FALSE);
library(e1071)
load(file="#{ model_file }")
label = predict(model, features);
cat(label, file="#{results}");
          EOF
          ).read
          Open.read(results)
        end
      end
    end

    # Each line is "<label> <features>"
    lines = text.split(/\n/)

    lines.each do |line|
      label, features = line.split(" ")
      model.add features, label
    end

    model.train

    # Second instance over the same directory, created with no procs
    model = VectorModel.new(dir)
    lines.each do |line|
      label, features = line.split(" ")
      model.add features, label
    end

    assert model.eval("1;1;1").to_f > 0.5
    assert model.eval("0;0;0").to_f < 0.5
  end
end
427
+
74
428
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-dm
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.46
4
+ version: 1.1.51
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-30 00:00:00.000000000 Z
11
+ date: 2021-06-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -39,7 +39,7 @@ dependencies:
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
- name: priority_queue_cxx
42
+ name: priority_queue_cxx17
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - ">="
@@ -105,8 +105,11 @@ files:
105
105
  - lib/rbbt/statistics/hypergeometric.rb
106
106
  - lib/rbbt/statistics/random_walk.rb
107
107
  - lib/rbbt/statistics/rank_product.rb
108
+ - lib/rbbt/tensorflow.rb
108
109
  - lib/rbbt/vector/model.rb
110
+ - lib/rbbt/vector/model/spaCy.rb
109
111
  - lib/rbbt/vector/model/svm.rb
112
+ - lib/rbbt/vector/model/tensorflow.rb
110
113
  - share/R/MA.R
111
114
  - share/R/barcode.R
112
115
  - share/R/heatmap.3.R
@@ -118,7 +121,9 @@ files:
118
121
  - test/rbbt/statistics/test_random_walk.rb
119
122
  - test/rbbt/test_ml_task.rb
120
123
  - test/rbbt/test_stan.rb
124
+ - test/rbbt/vector/model/test_spaCy.rb
121
125
  - test/rbbt/vector/model/test_svm.rb
126
+ - test/rbbt/vector/model/test_tensorflow.rb
122
127
  - test/rbbt/vector/test_model.rb
123
128
  - test/test_helper.rb
124
129
  homepage: http://github.com/mikisvaz/rbbt-phgx
@@ -139,7 +144,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
139
144
  - !ruby/object:Gem::Version
140
145
  version: '0'
141
146
  requirements: []
142
- rubygems_version: 3.0.6
147
+ rubygems_version: 3.1.4
143
148
  signing_key:
144
149
  specification_version: 4
145
150
  summary: Data-mining and statistics
@@ -152,6 +157,8 @@ test_files:
152
157
  - test/rbbt/statistics/test_hypergeometric.rb
153
158
  - test/rbbt/test_ml_task.rb
154
159
  - test/rbbt/vector/test_model.rb
160
+ - test/rbbt/vector/model/test_spaCy.rb
161
+ - test/rbbt/vector/model/test_tensorflow.rb
155
162
  - test/rbbt/vector/model/test_svm.rb
156
163
  - test/rbbt/test_stan.rb
157
164
  - test/test_helper.rb