rbbt-dm 1.2.6 → 1.2.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (34) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/matrix/barcode.rb +2 -2
  3. data/lib/rbbt/matrix/differential.rb +3 -3
  4. data/lib/rbbt/matrix/knowledge_base.rb +1 -1
  5. data/lib/rbbt/plots/bar.rb +1 -1
  6. data/lib/rbbt/stan.rb +1 -1
  7. data/lib/rbbt/statistics/hypergeometric.rb +2 -1
  8. data/lib/rbbt/vector/model/huggingface/masked_lm.rb +50 -0
  9. data/lib/rbbt/vector/model/huggingface.rb +57 -38
  10. data/lib/rbbt/vector/model/pytorch_lightning.rb +35 -0
  11. data/lib/rbbt/vector/model/random_forest.rb +1 -1
  12. data/lib/rbbt/vector/model/spaCy.rb +8 -14
  13. data/lib/rbbt/vector/model/tensorflow.rb +6 -5
  14. data/lib/rbbt/vector/model/torch.rb +37 -0
  15. data/lib/rbbt/vector/model/util.rb +18 -0
  16. data/lib/rbbt/vector/model.rb +100 -56
  17. data/python/rbbt_dm/__init__.py +48 -1
  18. data/python/rbbt_dm/atcold/__init__.py +0 -0
  19. data/python/rbbt_dm/atcold/plot_lib.py +141 -0
  20. data/python/rbbt_dm/atcold/spiral.py +27 -0
  21. data/python/rbbt_dm/huggingface.py +57 -26
  22. data/python/rbbt_dm/language_model.py +70 -0
  23. data/python/rbbt_dm/util.py +30 -0
  24. data/share/spaCy/gpu/textcat_accuracy.conf +2 -1
  25. data/test/rbbt/vector/model/huggingface/test_masked_lm.rb +41 -0
  26. data/test/rbbt/vector/model/test_huggingface.rb +258 -27
  27. data/test/rbbt/vector/model/test_pytorch_lightning.rb +83 -0
  28. data/test/rbbt/vector/model/test_spaCy.rb +1 -1
  29. data/test/rbbt/vector/model/test_tensorflow.rb +3 -0
  30. data/test/rbbt/vector/test_model.rb +25 -26
  31. data/test/test_helper.rb +13 -0
  32. metadata +26 -16
  33. data/lib/rbbt/tensorflow.rb +0 -43
  34. data/lib/rbbt/vector/model/huggingface.old.rb +0 -160
data/test/test_helper.rb CHANGED
@@ -19,4 +19,17 @@ class Test::Unit::TestCase
19
19
  def datafile_test(file)
20
20
  Test::Unit::TestCase.datafile_test(file)
21
21
  end
22
+
23
+ def with_python(code, &block)
24
+ TmpFile.with_file do |dir|
25
+ pkg = "pkg#{rand(100)}"
26
+ Open.write File.join(dir, "#{pkg}/__init__.py"), code
27
+
28
+ RbbtPython.add_path dir
29
+
30
+ Misc.in_dir dir do
31
+ yield pkg
32
+ end
33
+ end
34
+ end
22
35
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-dm
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.6
4
+ version: 1.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-02-08 00:00:00.000000000 Z
11
+ date: 2023-08-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -105,17 +105,23 @@ files:
105
105
  - lib/rbbt/statistics/hypergeometric.rb
106
106
  - lib/rbbt/statistics/random_walk.rb
107
107
  - lib/rbbt/statistics/rank_product.rb
108
- - lib/rbbt/tensorflow.rb
109
108
  - lib/rbbt/vector/model.rb
110
- - lib/rbbt/vector/model/huggingface.old.rb
111
109
  - lib/rbbt/vector/model/huggingface.rb
110
+ - lib/rbbt/vector/model/huggingface/masked_lm.rb
111
+ - lib/rbbt/vector/model/pytorch_lightning.rb
112
112
  - lib/rbbt/vector/model/random_forest.rb
113
113
  - lib/rbbt/vector/model/spaCy.rb
114
114
  - lib/rbbt/vector/model/svm.rb
115
115
  - lib/rbbt/vector/model/tensorflow.rb
116
+ - lib/rbbt/vector/model/torch.rb
116
117
  - lib/rbbt/vector/model/util.rb
117
118
  - python/rbbt_dm/__init__.py
119
+ - python/rbbt_dm/atcold/__init__.py
120
+ - python/rbbt_dm/atcold/plot_lib.py
121
+ - python/rbbt_dm/atcold/spiral.py
118
122
  - python/rbbt_dm/huggingface.py
123
+ - python/rbbt_dm/language_model.py
124
+ - python/rbbt_dm/util.py
119
125
  - share/R/MA.R
120
126
  - share/R/barcode.R
121
127
  - share/R/heatmap.3.R
@@ -135,7 +141,9 @@ files:
135
141
  - test/rbbt/statistics/test_random_walk.rb
136
142
  - test/rbbt/test_ml_task.rb
137
143
  - test/rbbt/test_stan.rb
144
+ - test/rbbt/vector/model/huggingface/test_masked_lm.rb
138
145
  - test/rbbt/vector/model/test_huggingface.rb
146
+ - test/rbbt/vector/model/test_pytorch_lightning.rb
139
147
  - test/rbbt/vector/model/test_spaCy.rb
140
148
  - test/rbbt/vector/model/test_svm.rb
141
149
  - test/rbbt/vector/model/test_tensorflow.rb
@@ -159,22 +167,24 @@ required_rubygems_version: !ruby/object:Gem::Requirement
159
167
  - !ruby/object:Gem::Version
160
168
  version: '0'
161
169
  requirements: []
162
- rubygems_version: 3.1.2
170
+ rubygems_version: 3.4.19
163
171
  signing_key:
164
172
  specification_version: 4
165
173
  summary: Data-mining and statistics
166
174
  test_files:
167
- - test/test_helper.rb
168
- - test/rbbt/vector/test_model.rb
169
- - test/rbbt/vector/model/test_huggingface.rb
170
- - test/rbbt/vector/model/test_tensorflow.rb
171
- - test/rbbt/vector/model/test_spaCy.rb
172
- - test/rbbt/vector/model/test_svm.rb
173
- - test/rbbt/statistics/test_random_walk.rb
174
- - test/rbbt/statistics/test_fisher.rb
175
+ - test/rbbt/matrix/test_barcode.rb
176
+ - test/rbbt/network/test_paths.rb
175
177
  - test/rbbt/statistics/test_fdr.rb
178
+ - test/rbbt/statistics/test_fisher.rb
176
179
  - test/rbbt/statistics/test_hypergeometric.rb
177
- - test/rbbt/test_stan.rb
178
- - test/rbbt/matrix/test_barcode.rb
180
+ - test/rbbt/statistics/test_random_walk.rb
179
181
  - test/rbbt/test_ml_task.rb
180
- - test/rbbt/network/test_paths.rb
182
+ - test/rbbt/test_stan.rb
183
+ - test/rbbt/vector/model/huggingface/test_masked_lm.rb
184
+ - test/rbbt/vector/model/test_huggingface.rb
185
+ - test/rbbt/vector/model/test_pytorch_lightning.rb
186
+ - test/rbbt/vector/model/test_spaCy.rb
187
+ - test/rbbt/vector/model/test_svm.rb
188
+ - test/rbbt/vector/model/test_tensorflow.rb
189
+ - test/rbbt/vector/test_model.rb
190
+ - test/test_helper.rb
@@ -1,43 +0,0 @@
1
- require 'rbbt/util/python'
2
-
3
- module RbbtTensorflow
4
-
5
- def self.init
6
- RbbtPython.run do
7
- pyimport "tensorflow", as: "tf"
8
- end
9
- end
10
-
11
- def self.test
12
-
13
- mod = x_test = y_test = nil
14
- RbbtPython.run do
15
-
16
- mnist_db = tf.keras.datasets.mnist
17
-
18
- (x_train, y_train), (x_test, y_test) = mnist_db.load_data()
19
- x_train, x_test = x_train / 255.0, x_test / 255.0
20
-
21
- mod = tf.keras.models.Sequential.new([
22
- tf.keras.layers.Flatten.new(input_shape: [28, 28]),
23
- tf.keras.layers.Dense.new(128, activation:'relu'),
24
- tf.keras.layers.Dropout.new(0.2),
25
- tf.keras.layers.Dense.new(10, activation:'softmax')
26
- ])
27
- mod.compile(optimizer='adam',
28
- loss='sparse_categorical_crossentropy',
29
- metrics=['accuracy'])
30
- mod.fit(x_train, y_train, epochs:3)
31
- mod
32
- end
33
-
34
- RbbtPython.run do
35
- mod.evaluate(x_test, y_test, verbose:2)
36
- end
37
- end
38
- end
39
-
40
- if __FILE__ == $0
41
- RbbtTensorflow.init
42
- RbbtTensorflow.test
43
- end
@@ -1,160 +0,0 @@
1
- require 'rbbt/vector/model'
2
- require 'rbbt/util/python'
3
-
4
- RbbtPython.add_path Rbbt.python.find(:lib)
5
- RbbtPython.init_rbbt
6
-
7
- class HuggingfaceModel < VectorModel
8
-
9
- attr_accessor :checkpoint, :task, :locate_tokens, :class_labels, :class_weights, :training_args
10
-
11
- def self.tsv_dataset(tsv_dataset_file, elements, labels = nil)
12
-
13
- if labels
14
- Open.write(tsv_dataset_file) do |ffile|
15
- ffile.puts ["label", "text"].flatten * "\t"
16
- elements.zip(labels).each do |element,label|
17
- ffile.puts [label, element].flatten * "\t"
18
- end
19
- end
20
- else
21
- Open.write(tsv_dataset_file) do |ffile|
22
- ffile.puts ["text"].flatten * "\t"
23
- elements.each{|element| ffile.puts element }
24
- end
25
- end
26
-
27
- tsv_dataset_file
28
- end
29
-
30
- def self.call_method(name, *args)
31
- RbbtPython.import_method("rbbt_dm.huggingface", name).call(*args)
32
- end
33
-
34
- def call_method(name, *args)
35
- HuggingfaceModel.call_method(name, *args)
36
- end
37
-
38
- #def input_tsv_file
39
- # File.join(@directory, 'dataset.tsv') if @directory
40
- #end
41
-
42
- #def checkpoint_dir
43
- # File.join(@directory, 'checkpoints') if @directory
44
- #end
45
-
46
- def self.run_model(model, tokenizer, elements, labels = nil, training_args = {}, class_weights = nil)
47
- TmpFile.with_file do |tmpfile|
48
- tsv_file = File.join(tmpfile, 'dataset.tsv')
49
-
50
- if training_args
51
- training_args = training_args.dup
52
- checkpoint_dir = training_args.delete(:checkpoint_dir)
53
- end
54
-
55
- checkpoint_dir = File.join(tmpfile, 'checkpoints')
56
-
57
- Open.mkdir File.dirname(tsv_file)
58
- Open.mkdir File.dirname(checkpoint_dir)
59
-
60
- if labels
61
- training_args_obj = call_method(:training_args, checkpoint_dir, **training_args)
62
- call_method(:train_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements, labels), class_weights)
63
- else
64
- locate_tokens, training_args = training_args, {}
65
- if Array === elements
66
- training_args_obj = call_method(:training_args, checkpoint_dir)
67
- call_method(:predict_model, model, tokenizer, training_args_obj, tsv_dataset(tsv_file, elements), locate_tokens)
68
- else
69
- call_method(:eval_model, model, tokenizer, [elements], locate_tokens)
70
- end
71
- end
72
- end
73
- end
74
-
75
- def init_model
76
- @model, @tokenizer = call_method(:load_model_and_tokenizer, @task, @checkpoint)
77
- end
78
-
79
- def reset_model
80
- init_model
81
- end
82
-
83
- def initialize(task, initial_checkpoint = nil, *args)
84
- super(*args)
85
- @task = task
86
-
87
- @checkpoint = model_file && File.exists?(model_file)? model_file : initial_checkpoint
88
-
89
- init_model
90
-
91
- @locate_tokens = @tokenizer.special_tokens_map["mask_token"] if @task == "MaskedLM"
92
-
93
- @training_args = {}
94
-
95
- train_model do |file,elements,labels|
96
- HuggingfaceModel.run_model(@model, @tokenizer, elements, labels, @training_args, @class_weights)
97
-
98
- @model.save_pretrained(file) if file
99
- @tokenizer.save_pretrained(file) if file
100
- end
101
-
102
- eval_model do |file,elements|
103
- @model, @tokenizer = HuggingfaceModel.call_method(:load_model_and_tokenizer, @task, @checkpoint)
104
- HuggingfaceModel.run_model(@model, @tokenizer, elements, nil, @locate_tokens)
105
- end
106
-
107
- post_process do |result|
108
- if result.respond_to?(:predictions)
109
- single = false
110
- predictions = result.predictions
111
- elsif result["token_positions"]
112
- predictions = result["result"].predictions
113
- token_positions = result["token_positions"]
114
- else
115
- single = true
116
- predictions = result["logits"]
117
- end
118
-
119
- result = case @task
120
- when "SequenceClassification"
121
- RbbtPython.collect(predictions) do |logits|
122
- logits = RbbtPython.numpy2ruby logits
123
- best_class = logits.index logits.max
124
- best_class = @class_labels[best_class] if @class_labels
125
- best_class
126
- end
127
- when "MaskedLM"
128
- all_token_positions = token_positions.to_a
129
-
130
- i = 0
131
- RbbtPython.collect(predictions) do |item_logits|
132
- item_token_positions = all_token_positions[i]
133
- i += 1
134
-
135
- item_logits = RbbtPython.numpy2ruby(item_logits)
136
- item_masks = item_token_positions.collect do |token_positions|
137
-
138
- best = item_logits.values_at(*token_positions).collect do |logits|
139
- best_token, best_score = nil
140
- logits.each_with_index do |v,i|
141
- if best_score.nil? || v > best_score
142
- best_token, best_score = i, v
143
- end
144
- end
145
- best_token
146
- end
147
-
148
- best.collect{|b| @tokenizer.decode(b) } * "|"
149
- end
150
- Array === @locate_tokens ? item_masks : item_masks.first
151
- end
152
- else
153
- logits
154
- end
155
-
156
- single ? result.first : result
157
- end
158
- end
159
- end
160
-