rbbt-dm 1.1.47 → 1.1.52

Sign up to get free protection for your applications and to get access to all the features.
@@ -9,7 +9,7 @@ class SVMModel < VectorModel
9
9
 
10
10
  @train_model =<<-EOF
11
11
  library(e1071);
12
- model = svm(class ~ ., data = features, scale=c(0));
12
+ model = svm(as.factor(class) ~ ., data = features);
13
13
  EOF
14
14
 
15
15
  @eval_model =<<-EOF
@@ -0,0 +1,55 @@
1
+ require 'rbbt/vector/model'
2
+ require 'rbbt/tensorflow'
3
+
4
# VectorModel whose training and evaluation are delegated to a Keras graph
# driven through RbbtPython.
#
# Training converts features and labels to tensors, compiles the graph with
# the stored compile options, fits it for +epochs+ epochs and saves it to the
# model file. Evaluation predicts with the graph and turns every prediction
# row into a single label.
#
# NOTE(review): blocks handed to #tensorflow and #keras are evaluated with
# RbbtPython.module_eval. Assuming that is the standard Module#module_eval,
# +self+ inside those blocks is RbbtPython, so instance variables referenced
# there belong to RbbtPython and not to this model. Local variables are still
# shared with the enclosing scope through the closure.
class TensorFlowModel < VectorModel
  attr_accessor :graph, :epochs, :compile_options

  # Runs the block through RbbtPython.module_eval inside a
  # RbbtPython.run "tensorflow" section.
  def tensorflow(&block)
    RbbtPython.run "tensorflow" do
      RbbtPython.module_eval(&block)
    end
  end

  # Same as #tensorflow, but nested inside a
  # RbbtPython.run "tensorflow.keras", as: 'keras' section.
  def keras(&block)
    RbbtPython.run "tensorflow.keras", as: 'keras' do
      RbbtPython.run "tensorflow" do
        RbbtPython.module_eval(&block)
      end
    end
  end

  # dir             - directory handed to VectorModel#initialize.
  # graph           - graph object to train; may be nil and set later through
  #                   #graph= or #keras_graph.
  # epochs          - number of epochs passed to the graph's fit call.
  # compile_options - keyword options forwarded to the graph's compile call.
  def initialize(dir, graph = nil, epochs = 3, **compile_options)
    @graph = graph
    @epochs = epochs
    @compile_options = compile_options

    super(dir)

    @train_model = Proc.new do |file, features, labels|
      # The assignments reach the proc parameters through the closure.
      tensorflow do
        features = tensorflow.convert_to_tensor(features)
        labels = tensorflow.convert_to_tensor(labels)
      end
      # NOTE(review): with no graph configured this calls #keras_graph without
      # a block, which presumably fails inside module_eval -- confirm whether
      # a clearer error is wanted here.
      @graph ||= keras_graph
      @graph.compile(**@compile_options)
      @graph.fit(features, labels, :epochs => @epochs, :verbose => false)
      @graph.save(file)
    end

    @eval_model = Proc.new do |file, features|
      tensorflow do
        features = tensorflow.convert_to_tensor(features)
      end

      # Resolve the graph out here, where @graph is the model's own instance
      # variable. Doing this inside the keras block would read and cache the
      # graph on RbbtPython instead (see the class note), ignoring the graph
      # produced by training and sharing one loaded graph across models.
      @graph ||= keras { keras.models.load_model(file) }
      graph = @graph

      keras do
        predictions = graph.predict(features, :verbose => false).tolist()
        # Rows with several values map to the position of their maximum;
        # single-value rows map to that value.
        predictions.collect { |p| p.length > 1 ? p.index(p.max) : p.first }
      end
    end
  end

  # Evaluates the block through #keras and stores its result as the graph.
  def keras_graph(&block)
    @graph = keras(&block)
  end
end
@@ -0,0 +1,86 @@
1
+ # This is an auto-generated partial config. To use it with 'spacy train'
2
+ # you can run spacy init fill-config to auto-fill all default settings:
3
+ # python -m spacy init fill-config ./base_config.cfg ./config.cfg
4
+ [paths]
5
+ train = null
6
+ dev = null
7
+
8
+ [system]
9
+ gpu_allocator = null
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["tok2vec","textcat"]
14
+ batch_size = 1000
15
+
16
+ [components]
17
+
18
+ [components.tok2vec]
19
+ factory = "tok2vec"
20
+
21
+ [components.tok2vec.model]
22
+ @architectures = "spacy.Tok2Vec.v2"
23
+
24
+ [components.tok2vec.model.embed]
25
+ @architectures = "spacy.MultiHashEmbed.v2"
26
+ width = ${components.tok2vec.model.encode.width}
27
+ attrs = ["ORTH", "SHAPE"]
28
+ rows = [5000, 2500]
29
+ include_static_vectors = true
30
+
31
+ [components.tok2vec.model.encode]
32
+ @architectures = "spacy.MaxoutWindowEncoder.v2"
33
+ width = 256
34
+ depth = 8
35
+ window_size = 1
36
+ maxout_pieces = 3
37
+
38
+ [components.textcat]
39
+ factory = "textcat"
40
+
41
+ [components.textcat.model]
42
+ @architectures = "spacy.TextCatEnsemble.v2"
43
+ nO = null
44
+
45
+ [components.textcat.model.tok2vec]
46
+ @architectures = "spacy.Tok2VecListener.v1"
47
+ width = ${components.tok2vec.model.encode.width}
48
+
49
+ [components.textcat.model.linear_model]
50
+ @architectures = "spacy.TextCatBOW.v1"
51
+ exclusive_classes = true
52
+ ngram_size = 1
53
+ no_output_layer = false
54
+
55
+ [corpora]
56
+
57
+ [corpora.train]
58
+ @readers = "spacy.Corpus.v1"
59
+ path = ${paths.train}
60
+ max_length = 2000
61
+
62
+ [corpora.dev]
63
+ @readers = "spacy.Corpus.v1"
64
+ path = ${paths.dev}
65
+ max_length = 0
66
+
67
+ [training]
68
+ dev_corpus = "corpora.dev"
69
+ train_corpus = "corpora.train"
70
+
71
+ [training.optimizer]
72
+ @optimizers = "Adam.v1"
73
+
74
+ [training.batcher]
75
+ @batchers = "spacy.batch_by_words.v1"
76
+ discard_oversize = false
77
+ tolerance = 0.2
78
+
79
+ [training.batcher.size]
80
+ @schedules = "compounding.v1"
81
+ start = 100
82
+ stop = 1000
83
+ compound = 1.001
84
+
85
+ [initialize]
86
+ vectors = "en_core_web_lg"
@@ -0,0 +1,78 @@
1
+ # This is an auto-generated partial config. To use it with 'spacy train'
2
+ # you can run spacy init fill-config to auto-fill all default settings:
3
+ # python -m spacy init fill-config ./base_config.cfg ./config.cfg
4
+ [paths]
5
+ train = null
6
+ dev = null
7
+
8
+ [system]
9
+ gpu_allocator = null
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["tok2vec","textcat"]
14
+ batch_size = 1000
15
+
16
+ [components]
17
+
18
+ [components.tok2vec]
19
+ factory = "tok2vec"
20
+
21
+ [components.tok2vec.model]
22
+ @architectures = "spacy.Tok2Vec.v2"
23
+
24
+ [components.tok2vec.model.embed]
25
+ @architectures = "spacy.MultiHashEmbed.v2"
26
+ width = ${components.tok2vec.model.encode.width}
27
+ attrs = ["ORTH", "SHAPE"]
28
+ rows = [5000, 2500]
29
+ include_static_vectors = false
30
+
31
+ [components.tok2vec.model.encode]
32
+ @architectures = "spacy.MaxoutWindowEncoder.v2"
33
+ width = 96
34
+ depth = 4
35
+ window_size = 1
36
+ maxout_pieces = 3
37
+
38
+ [components.textcat]
39
+ factory = "textcat"
40
+
41
+ [components.textcat.model]
42
+ @architectures = "spacy.TextCatBOW.v1"
43
+ exclusive_classes = true
44
+ ngram_size = 1
45
+ no_output_layer = false
46
+
47
+ [corpora]
48
+
49
+ [corpora.train]
50
+ @readers = "spacy.Corpus.v1"
51
+ path = ${paths.train}
52
+ max_length = 2000
53
+
54
+ [corpora.dev]
55
+ @readers = "spacy.Corpus.v1"
56
+ path = ${paths.dev}
57
+ max_length = 0
58
+
59
+ [training]
60
+ dev_corpus = "corpora.dev"
61
+ train_corpus = "corpora.train"
62
+
63
+ [training.optimizer]
64
+ @optimizers = "Adam.v1"
65
+
66
+ [training.batcher]
67
+ @batchers = "spacy.batch_by_words.v1"
68
+ discard_oversize = false
69
+ tolerance = 0.2
70
+
71
+ [training.batcher.size]
72
+ @schedules = "compounding.v1"
73
+ start = 100
74
+ stop = 1000
75
+ compound = 1.001
76
+
77
+ [initialize]
78
+ vectors = null
@@ -0,0 +1,84 @@
1
+ # This is an auto-generated partial config. To use it with 'spacy train'
2
+ # you can run spacy init fill-config to auto-fill all default settings:
3
+ # python -m spacy init fill-config ./base_config.cfg ./config.cfg
4
+ [paths]
5
+ train = null
6
+ dev = null
7
+
8
+ [system]
9
+ gpu_allocator = "pytorch"
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["transformer","textcat"]
14
+ batch_size = 128
15
+
16
+ [components]
17
+
18
+ [components.transformer]
19
+ factory = "transformer"
20
+
21
+ [components.transformer.model]
22
+ @architectures = "spacy-transformers.TransformerModel.v1"
23
+ name = "roberta-base"
24
+ tokenizer_config = {"use_fast": true}
25
+
26
+ [components.transformer.model.get_spans]
27
+ @span_getters = "spacy-transformers.strided_spans.v1"
28
+ window = 128
29
+ stride = 96
30
+
31
+ [components.textcat]
32
+ factory = "textcat"
33
+
34
+ [components.textcat.model]
35
+ @architectures = "spacy.TextCatEnsemble.v2"
36
+ nO = null
37
+
38
+ [components.textcat.model.tok2vec]
39
+ @architectures = "spacy-transformers.TransformerListener.v1"
40
+ grad_factor = 1.0
41
+
42
+ [components.textcat.model.tok2vec.pooling]
43
+ @layers = "reduce_mean.v1"
44
+
45
+ [components.textcat.model.linear_model]
46
+ @architectures = "spacy.TextCatBOW.v1"
47
+ exclusive_classes = true
48
+ ngram_size = 1
49
+ no_output_layer = false
50
+
51
+ [corpora]
52
+
53
+ [corpora.train]
54
+ @readers = "spacy.Corpus.v1"
55
+ path = ${paths.train}
56
+ max_length = 500
57
+
58
+ [corpora.dev]
59
+ @readers = "spacy.Corpus.v1"
60
+ path = ${paths.dev}
61
+ max_length = 0
62
+
63
+ [training]
64
+ accumulate_gradient = 3
65
+ dev_corpus = "corpora.dev"
66
+ train_corpus = "corpora.train"
67
+
68
+ [training.optimizer]
69
+ @optimizers = "Adam.v1"
70
+
71
+ [training.optimizer.learn_rate]
72
+ @schedules = "warmup_linear.v1"
73
+ warmup_steps = 250
74
+ total_steps = 20000
75
+ initial_rate = 5e-5
76
+
77
+ [training.batcher]
78
+ @batchers = "spacy.batch_by_padded.v1"
79
+ discard_oversize = true
80
+ size = 2000
81
+ buffer = 256
82
+
83
+ [initialize]
84
+ vectors = null
@@ -0,0 +1,73 @@
1
+ # This is an auto-generated partial config. To use it with 'spacy train'
2
+ # you can run spacy init fill-config to auto-fill all default settings:
3
+ # python -m spacy init fill-config ./base_config.cfg ./config.cfg
4
+ [paths]
5
+ train = null
6
+ dev = null
7
+
8
+ [system]
9
+ gpu_allocator = "pytorch"
10
+
11
+ [nlp]
12
+ lang = "en"
13
+ pipeline = ["transformer","textcat"]
14
+ batch_size = 128
15
+
16
+ [components]
17
+
18
+ [components.transformer]
19
+ factory = "transformer"
20
+
21
+ [components.transformer.model]
22
+ @architectures = "spacy-transformers.TransformerModel.v1"
23
+ name = "roberta-base"
24
+ tokenizer_config = {"use_fast": true}
25
+
26
+ [components.transformer.model.get_spans]
27
+ @span_getters = "spacy-transformers.strided_spans.v1"
28
+ window = 128
29
+ stride = 96
30
+
31
+ [components.textcat]
32
+ factory = "textcat"
33
+
34
+ [components.textcat.model]
35
+ @architectures = "spacy.TextCatBOW.v1"
36
+ exclusive_classes = true
37
+ ngram_size = 1
38
+ no_output_layer = false
39
+
40
+ [corpora]
41
+
42
+ [corpora.train]
43
+ @readers = "spacy.Corpus.v1"
44
+ path = ${paths.train}
45
+ max_length = 500
46
+
47
+ [corpora.dev]
48
+ @readers = "spacy.Corpus.v1"
49
+ path = ${paths.dev}
50
+ max_length = 0
51
+
52
+ [training]
53
+ accumulate_gradient = 3
54
+ dev_corpus = "corpora.dev"
55
+ train_corpus = "corpora.train"
56
+
57
+ [training.optimizer]
58
+ @optimizers = "Adam.v1"
59
+
60
+ [training.optimizer.learn_rate]
61
+ @schedules = "warmup_linear.v1"
62
+ warmup_steps = 250
63
+ total_steps = 20000
64
+ initial_rate = 5e-5
65
+
66
+ [training.batcher]
67
+ @batchers = "spacy.batch_by_padded.v1"
68
+ discard_oversize = true
69
+ size = 2000
70
+ buffer = 256
71
+
72
+ [initialize]
73
+ vectors = null
@@ -11,7 +11,7 @@ class TestBarcode < Test::Unit::TestCase
11
11
  data["G4"] = [6,6,1,1,1,1]
12
12
 
13
13
  TmpFile.with_file(data.to_s) do |file|
14
- m = Matrix.new file
14
+ m = RbbtMatrix.new file
15
15
  m.barcode(file+'.barcode')
16
16
  tsv = TSV.open(file+'.barcode')
17
17
  assert tsv["G2"] = [0,1,0,1,0,1]
@@ -20,7 +20,7 @@ N4 N5
20
20
  end_node = "N5"
21
21
 
22
22
  path = Paths.dijkstra(network, start_node, [end_node])
23
- assert_equal %w(N1 N2 N4), path.reverse
23
+ assert_equal %w(N1 N2 N4 N5), path.reverse
24
24
  end
25
25
 
26
26
  def test_weighted_dijsktra
@@ -0,0 +1,121 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/vector/model/spaCy'
3
+ require 'rbbt/vector/model/svm'
4
+
5
# Cross-validation smoke tests for the spaCy-backed text classifiers and for
# an SVM fed with averaged spaCy token vectors. All tests download the same
# public review dataset.
class TestSpaCyModel < Test::Unit::TestCase
  # CSV with one review per row; the tests use its "Review Text" and
  # "Recommended IND" columns.
  REVIEWS_URL = "https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"

  # Maximum number of texts taken for label '1' and label '0' respectively.
  GOOD_SAMPLE_SIZE = 2000
  BAD_SAMPLE_SIZE = 500

  def test_spyCy
    TmpFile.with_file() do |dir|
      Log.severity = 0
      FileUtils.mkdir_p dir

      model = SpaCyModel.new(
        dir,
        "cpu/textcat_efficiency.conf"
      )

      add_review_samples(model)

      model.cross_validation
    end
  end

  def test_svm_spacy
    # The model needs its own temporary directory; +dir+ is only defined
    # inside this block.
    TmpFile.with_file() do |dir|
      FileUtils.mkdir_p dir

      model = SVMModel.new(
        dir
      )

      nlp = RbbtPython.run "spacy" do
        spacy.load('en_core_web_md')
      end

      # Feature vector of a text: element-wise mean of its token vectors.
      model.extract_features = Proc.new do |text|
        vs = RbbtPython.run do
          RbbtPython.collect nlp.(text).__iter__ do |token|
            token.vector.tolist()
          end
        end
        length = vs.length

        v = vs.inject { |acc, ev| acc.zip(ev).collect { |a, b| a + b } }

        v.collect { |e| e / length }
      end

      add_review_samples(model, bar: true)

      model.cross_validation
    end
  end

  def test_spyCy_trf
    TmpFile.with_file() do |dir|
      Log.severity = 0
      FileUtils.mkdir_p dir

      model = SpaCyModel.new(
        dir,
        "gpu/textcat_accuracy.conf"
      )

      add_review_samples(model)

      model.cross_validation
    end
  end

  private

  # Downloads the review CSV and returns [good_texts, bad_texts]: the first
  # GOOD_SAMPLE_SIZE texts with "Recommended IND" '1' and the first
  # BAD_SAMPLE_SIZE texts with '0', with nil and empty texts dropped from
  # both lists.
  def review_texts
    require 'rbbt/tsv/csv'
    tsv = TSV.csv(Open.open(REVIEWS_URL))
    tsv = tsv.reorder("Review Text", ["Recommended IND"]).to_single

    good = tsv.select("Recommended IND" => '1').keys.first(GOOD_SAMPLE_SIZE)
    bad = tsv.select("Recommended IND" => '0').keys.first(BAD_SAMPLE_SIZE)

    [good, bad].collect { |texts| texts.reject { |text| text.nil? || text.empty? } }
  end

  # Adds the review texts to +model+, labelled '1' (good) and '0' (bad).
  # With bar: true the texts are iterated through TSV.traverse with its
  # :bar option enabled; otherwise a plain each is used.
  def add_review_samples(model, bar: false)
    good, bad = review_texts

    { '1' => good, '0' => bad }.each do |label, texts|
      if bar
        TSV.traverse texts, :type => :array, :bar => true do |text|
          model.add text, label
        end
      else
        texts.each { |text| model.add text, label }
      end
    end
  end
end
121
+