rbbt-dm 1.2.9 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/vector/model/huggingface.rb +10 -40
- data/lib/rbbt/vector/model/python.rb +33 -0
- data/lib/rbbt/vector/model/pytorch_lightning.rb +19 -23
- data/lib/rbbt/vector/model/torch/dataloader.rb +58 -0
- data/lib/rbbt/vector/model/torch/helpers.rb +52 -0
- data/lib/rbbt/vector/model/torch/introspection.rb +31 -0
- data/lib/rbbt/vector/model/torch/load_and_save.rb +30 -0
- data/lib/rbbt/vector/model/torch.rb +60 -26
- data/lib/rbbt/vector/model.rb +2 -2
- data/python/rbbt_dm/__init__.py +4 -21
- data/python/rbbt_dm/huggingface.py +9 -4
- data/python/rbbt_dm/util.py +2 -0
- data/test/rbbt/vector/model/test_huggingface.rb +2 -2
- data/test/rbbt/vector/model/test_python.rb +31 -0
- data/test/rbbt/vector/model/test_pytorch_lightning.rb +80 -66
- data/test/rbbt/vector/model/test_torch.rb +61 -0
- metadata +12 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f9b8071884e4e9d7a8c04f175fe262aad9e2b77911dca787a957a5c5f797fb9b
|
4
|
+
data.tar.gz: 1c7334d62036d3ae07b7f625b310f401b5078022f909be34cd78bb66c5b2af06
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 22c73d01543e93a2a7b10ecaa88db9a663b35c8264b6d0e5e9d4b00096f34955250105dec4787242529c594c1a959feb23a4b5cd46298850eee7a813dc551d0f
|
7
|
+
data.tar.gz: 545663b2ee93dd0e6e6b54e353cb3bfafab9001c7031b42e7f895fb95ea85ffb6c1dcdb54bb671ee5cace49561cca018212e25ee43592b457e4e1abe83277076
|
@@ -2,53 +2,23 @@ require 'rbbt/vector/model/torch'
|
|
2
2
|
|
3
3
|
class HuggingfaceModel < TorchModel
|
4
4
|
|
5
|
-
def self.tsv_dataset(tsv_dataset_file, elements, labels = nil, class_labels = nil)
|
6
|
-
|
7
|
-
if labels
|
8
|
-
labels = case class_labels
|
9
|
-
when Array
|
10
|
-
labels.collect{|l| class_labels.index l}
|
11
|
-
when Hash
|
12
|
-
inverse_class_labels = {}
|
13
|
-
class_labels.each{|c,l| inverse_class_labels[l] = c }
|
14
|
-
labels.collect{|l| inverse_class_labels[l]}
|
15
|
-
else
|
16
|
-
labels
|
17
|
-
end
|
18
|
-
|
19
|
-
Open.write(tsv_dataset_file) do |ffile|
|
20
|
-
ffile.puts ["label", "text"].flatten * "\t"
|
21
|
-
elements.zip(labels).each do |element,label|
|
22
|
-
element = element.gsub("\n", " ")
|
23
|
-
ffile.puts [label, element].flatten * "\t"
|
24
|
-
end
|
25
|
-
ffile.sync
|
26
|
-
end
|
27
|
-
else
|
28
|
-
Open.write(tsv_dataset_file) do |ffile|
|
29
|
-
ffile.puts ["text"].flatten * "\t"
|
30
|
-
elements.each do |element|
|
31
|
-
element = element.gsub("\n", " ")
|
32
|
-
ffile.puts element
|
33
|
-
end
|
34
|
-
ffile.sync
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
tsv_dataset_file
|
39
|
-
end
|
40
|
-
|
41
5
|
def initialize(task, checkpoint, dir = nil, model_options = {})
|
42
|
-
super(dir, model_options)
|
6
|
+
super(dir, nil, model_options)
|
7
|
+
|
8
|
+
checkpoint = checkpoint.find if Path === checkpoint
|
43
9
|
|
44
10
|
@model_options = Misc.add_defaults @model_options, :task => task, :checkpoint => checkpoint
|
45
11
|
|
46
12
|
init_model do
|
47
13
|
checkpoint = @model_path && File.directory?(@model_path) ? @model_path : @model_options[:checkpoint]
|
14
|
+
|
48
15
|
model = RbbtPython.call_method("rbbt_dm.huggingface", :load_model,
|
49
16
|
@model_options[:task], checkpoint, **(IndiferentHash.setup(model_options[:model_args]) || {}))
|
17
|
+
|
18
|
+
tokenizer_checkpoint = @model_options[:tokenizer_checkpoint] || checkpoint
|
19
|
+
|
50
20
|
tokenizer = RbbtPython.call_method("rbbt_dm.huggingface", :load_tokenizer,
|
51
|
-
@model_options[:task],
|
21
|
+
@model_options[:task], tokenizer_checkpoint, **(IndiferentHash.setup(model_options[:tokenizer_args]) || {}))
|
52
22
|
|
53
23
|
[model, tokenizer]
|
54
24
|
end
|
@@ -75,7 +45,7 @@ class HuggingfaceModel < TorchModel
|
|
75
45
|
checkpoint_dir = File.join(tmpdir, 'checkpoints')
|
76
46
|
end
|
77
47
|
|
78
|
-
dataset_file =
|
48
|
+
dataset_file = TorchModel.text_dataset(tsv_file, texts)
|
79
49
|
training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
|
80
50
|
|
81
51
|
begin
|
@@ -102,7 +72,7 @@ class HuggingfaceModel < TorchModel
|
|
102
72
|
end
|
103
73
|
|
104
74
|
training_args_obj = RbbtPython.call_method("rbbt_dm.huggingface", :training_args, checkpoint_dir, @model_options[:training_args])
|
105
|
-
dataset_file = HuggingfaceModel.
|
75
|
+
dataset_file = HuggingfaceModel.text_dataset(tsv_file, texts, labels, @model_options[:class_labels])
|
106
76
|
|
107
77
|
RbbtPython.call_method("rbbt_dm.huggingface", :train_model, model, tokenizer, training_args_obj, dataset_file, @model_options[:class_weights])
|
108
78
|
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'rbbt/vector/model'
|
2
|
+
require 'rbbt/util/python'
|
3
|
+
|
4
|
+
RbbtPython.add_path Rbbt.python.find(:lib)
|
5
|
+
RbbtPython.init_rbbt
|
6
|
+
|
7
|
+
class PythonModel < VectorModel
|
8
|
+
attr_accessor :python_class, :python_module
|
9
|
+
def initialize(dir, python_class = nil, python_module = nil, model_options = nil)
|
10
|
+
python_module = :model if python_module.nil?
|
11
|
+
model_options, python_module = python_module, :model if model_options.nil? && Hash === python_module
|
12
|
+
model_options = {} if model_options.nil?
|
13
|
+
|
14
|
+
super(dir, model_options)
|
15
|
+
|
16
|
+
@python_class = python_class
|
17
|
+
@python_module = python_module
|
18
|
+
|
19
|
+
init_model do
|
20
|
+
RbbtPython.add_path @directory
|
21
|
+
RbbtPython.class_new_obj(@python_module, @python_class, **model_options)
|
22
|
+
end if python_class
|
23
|
+
|
24
|
+
eval_model do |features,list=false|
|
25
|
+
init
|
26
|
+
if list
|
27
|
+
model.eval(features)
|
28
|
+
else
|
29
|
+
model.eval([features])[0]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
@@ -2,34 +2,30 @@ require 'rbbt/vector/model/torch'
|
|
2
2
|
|
3
3
|
class PytorchLightningModel < TorchModel
|
4
4
|
attr_accessor :loader, :val_loader, :trainer
|
5
|
-
def initialize(
|
6
|
-
super(
|
7
|
-
@module_name = module_name
|
8
|
-
@class_name = class_name
|
9
|
-
|
10
|
-
init_model do
|
11
|
-
RbbtPython.pyimport @module_name
|
12
|
-
RbbtPython.class_new_obj(@module_name, @class_name, @model_options[:model_args] || {})
|
13
|
-
end
|
5
|
+
def initialize(...)
|
6
|
+
super(...)
|
14
7
|
|
15
8
|
train_model do |features,labels|
|
16
9
|
model = init
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
model.call(RbbtPython.call_method(:torch, :tensor, features))
|
26
|
-
else
|
27
|
-
model.call(RbbtPython.call_method(:torch, :tensor, [features]))
|
10
|
+
loader = self.loader
|
11
|
+
val_loader = self.val_loader
|
12
|
+
if (features && features.any?) && loader.nil?
|
13
|
+
TmpFile.with_file do |tsv_dataset_file|
|
14
|
+
TorchModel.feature_dataset(tsv_dataset_file, features, labels)
|
15
|
+
RbbtPython.pyimport :rbbt_dm
|
16
|
+
loader = RbbtPython.rbbt_dm.tsv(tsv_dataset_file)
|
17
|
+
end
|
28
18
|
end
|
19
|
+
trainer.fit(model, loader, val_loader)
|
20
|
+
TorchModel.save_architecture(model, model_path) if @directory
|
21
|
+
TorchModel.save_state(model, model_path) if @directory
|
29
22
|
end
|
30
|
-
|
31
23
|
end
|
32
|
-
end
|
33
24
|
|
34
|
-
|
25
|
+
def trainer
|
26
|
+
@trainer ||= begin
|
27
|
+
options = @model_options[:training_args] || @model_options[:trainer_args]
|
28
|
+
RbbtPython.class_new_obj("pytorch_lightning", "Trainer", options || {})
|
29
|
+
end
|
30
|
+
end
|
35
31
|
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
class TorchModel
|
2
|
+
def self.feature_tsv(elements, labels = nil, class_labels = nil)
|
3
|
+
tsv = TSV.setup({}, :key_field => "ID", :fields => ["features"], :type => :flat)
|
4
|
+
if labels
|
5
|
+
tsv.fields = tsv.fields + ["label"]
|
6
|
+
labels = case class_labels
|
7
|
+
when Array
|
8
|
+
labels.collect{|l| class_labels.index l}
|
9
|
+
when Hash
|
10
|
+
inverse_class_labels = {}
|
11
|
+
class_labels.each{|c,l| inverse_class_labels[l] = c }
|
12
|
+
labels.collect{|l| inverse_class_labels[l]}
|
13
|
+
else
|
14
|
+
labels
|
15
|
+
end
|
16
|
+
elements.zip(labels).each_with_index do |p,i|
|
17
|
+
features, label = p
|
18
|
+
id = i
|
19
|
+
if Array === features
|
20
|
+
tsv[id] = features + [label]
|
21
|
+
else
|
22
|
+
tsv[id] = [features, label]
|
23
|
+
end
|
24
|
+
end
|
25
|
+
else
|
26
|
+
elements.each_with_index do |features,i|
|
27
|
+
id = i
|
28
|
+
if Array === features
|
29
|
+
tsv[id] = features
|
30
|
+
else
|
31
|
+
tsv[id] = [features]
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
tsv
|
36
|
+
end
|
37
|
+
|
38
|
+
def self.feature_dataset(tsv_dataset_file, elements, labels = nil, class_labels = nil)
|
39
|
+
tsv = feature_tsv(elements, labels, class_labels)
|
40
|
+
Open.write(tsv_dataset_file, tsv.to_s)
|
41
|
+
tsv_dataset_file
|
42
|
+
end
|
43
|
+
|
44
|
+
def self.text_dataset(tsv_dataset_file, elements, labels = nil, class_labels = nil)
|
45
|
+
elements = elements.collect{|e| e.gsub("\n", ' ') }
|
46
|
+
tsv = feature_tsv(elements, labels, class_labels)
|
47
|
+
if labels.nil?
|
48
|
+
tsv.fields[0] = "text"
|
49
|
+
tsv.type = :single
|
50
|
+
else
|
51
|
+
tsv.fields[0] = "text"
|
52
|
+
tsv.type = :list
|
53
|
+
end
|
54
|
+
Open.write(tsv_dataset_file, tsv.to_s)
|
55
|
+
tsv_dataset_file
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
class TorchModel
|
2
|
+
module Tensor
|
3
|
+
def to_ruby
|
4
|
+
RbbtPython.numpy2ruby(self)
|
5
|
+
end
|
6
|
+
def self.setup(obj)
|
7
|
+
obj.extend Tensor
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.init_python
|
12
|
+
RbbtPython.pyimport :torch
|
13
|
+
RbbtPython.pyimport :rbbt
|
14
|
+
RbbtPython.pyimport :rbbt_dm
|
15
|
+
RbbtPython.pyfrom :rbbt_dm, import: :util
|
16
|
+
RbbtPython.pyfrom :torch, import: :nn
|
17
|
+
end
|
18
|
+
|
19
|
+
def self.optimizer(model, training_args)
|
20
|
+
begin
|
21
|
+
learning_rate = training_args[:learning_rate] || 0.01
|
22
|
+
RbbtPython.torch.optim.SGD.new(model.parameters(), lr: learning_rate)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
def self.device(model_options)
|
27
|
+
case model_options[:device]
|
28
|
+
when String, Symbol
|
29
|
+
RbbtPython.torch.device(model_options[:device].to_s)
|
30
|
+
when nil
|
31
|
+
RbbtPython.rbbt_dm.util.device()
|
32
|
+
else
|
33
|
+
model_options[:device]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.dtype(model_options)
|
38
|
+
case model_options[:dtype]
|
39
|
+
when String, Symbol
|
40
|
+
RbbtPython.torch.call(model_options[:dtype])
|
41
|
+
when nil
|
42
|
+
nil
|
43
|
+
else
|
44
|
+
model_options[:dtype]
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def self.tensor(obj, device, dtype)
|
49
|
+
RbbtPython.torch.tensor(obj, dtype: dtype, device: device)
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
class TorchModel
|
2
|
+
def self.get_layer(model, layer = nil)
|
3
|
+
if layer.nil?
|
4
|
+
model
|
5
|
+
else
|
6
|
+
layer.split(".").inject(model){|acc,l| PyCall.getattr(acc, l.to_sym) }
|
7
|
+
end
|
8
|
+
end
|
9
|
+
def get_layer(...); TorchModel.get_layer(model, ...); end
|
10
|
+
|
11
|
+
def self.get_weights(model, layer = nil)
|
12
|
+
Tensor.setup PyCall.getattr(get_layer(model, layer), :weight)
|
13
|
+
end
|
14
|
+
def get_weights(...); TorchModel.get_weights(model, ...); end
|
15
|
+
|
16
|
+
def self.freeze(layer)
|
17
|
+
begin
|
18
|
+
PyCall.getattr(layer, :weight).requires_grad = false
|
19
|
+
rescue
|
20
|
+
end
|
21
|
+
RbbtPython.iterate(layer.children) do |layer|
|
22
|
+
freeze(layer)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
def self.freeze_layer(model, layer)
|
26
|
+
layer = get_layer(model, layer)
|
27
|
+
freeze(layer)
|
28
|
+
end
|
29
|
+
def freeze_layer(...); TorchModel.freeze_layer(model, ...); end
|
30
|
+
|
31
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
class TorchModel
|
2
|
+
def self.model_architecture(model_path)
|
3
|
+
model_path + '.architecture'
|
4
|
+
end
|
5
|
+
|
6
|
+
def self.save_state(model, model_path)
|
7
|
+
Log.debug "Saving model state into #{model_path}"
|
8
|
+
RbbtPython.torch.save(model.state_dict(), model_path)
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.load_state(model, model_path)
|
12
|
+
return model unless Open.exists?(model_path)
|
13
|
+
Log.debug "Loading model state from #{model_path}"
|
14
|
+
model.load_state_dict(RbbtPython.torch.load(model_path))
|
15
|
+
model
|
16
|
+
end
|
17
|
+
|
18
|
+
def self.save_architecture(model, model_path)
|
19
|
+
model_architecture = model_architecture(model_path)
|
20
|
+
Log.debug "Saving model architecture into #{model_architecture}"
|
21
|
+
RbbtPython.torch.save(model, model_architecture)
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.load_architecture(model_path)
|
25
|
+
model_architecture = model_architecture(model_path)
|
26
|
+
return unless Open.exists?(model_architecture)
|
27
|
+
Log.debug "Loading model architecture from #{model_architecture}"
|
28
|
+
RbbtPython.torch.load(model_architecture)
|
29
|
+
end
|
30
|
+
end
|
@@ -1,37 +1,71 @@
|
|
1
|
-
|
2
|
-
require 'rbbt/util/python'
|
1
|
+
require_relative 'python'
|
3
2
|
|
4
|
-
|
5
|
-
RbbtPython.init_rbbt
|
3
|
+
class TorchModel < PythonModel
|
6
4
|
|
7
|
-
|
5
|
+
attr_accessor :model, :criterion, :optimizer, :training_args
|
8
6
|
|
9
|
-
|
7
|
+
def initialize(...)
|
8
|
+
TorchModel.init_python
|
9
|
+
super(...)
|
10
|
+
@training_args = model_options[:training_args] || {}
|
10
11
|
|
11
|
-
|
12
|
-
|
13
|
-
|
12
|
+
init_model do
|
13
|
+
model = TorchModel.load_architecture(model_path)
|
14
|
+
if model.nil?
|
15
|
+
RbbtPython.add_path @directory
|
16
|
+
RbbtPython.class_new_obj(@python_module, @python_class, **model_options)
|
17
|
+
else
|
18
|
+
TorchModel.load_state(model, model_path)
|
19
|
+
end
|
20
|
+
end
|
14
21
|
|
15
|
-
|
16
|
-
|
17
|
-
|
22
|
+
eval_model do |features,list=false|
|
23
|
+
init
|
24
|
+
@device ||= TorchModel.device(model_options)
|
25
|
+
@dtype ||= TorchModel.dtype(model_options)
|
26
|
+
model.to(@device)
|
18
27
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
28
|
+
tensor = list ? TorchModel.tensor(features, @device, @dtype) : TorchModel.tensor([features], @device, @dtype)
|
29
|
+
|
30
|
+
loss, res = model.call(tensor)
|
31
|
+
|
32
|
+
res = loss if res.nil?
|
33
|
+
|
34
|
+
res = TorchModel::Tensor.setup(list ? res : res[0])
|
35
|
+
|
36
|
+
res
|
26
37
|
end
|
27
|
-
end
|
28
38
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
39
|
+
train_model do |features,labels|
|
40
|
+
init
|
41
|
+
@device ||= TorchModel.device(model_options)
|
42
|
+
@dtype ||= TorchModel.dtype(model_options)
|
43
|
+
model.to(@device)
|
44
|
+
@optimizer ||= TorchModel.optimizer(model, training_args)
|
45
|
+
epochs = training_args[:epochs] || 3
|
46
|
+
|
47
|
+
inputs = TorchModel.tensor(features, @device, @dtype)
|
48
|
+
#target = TorchModel.tensor(labels.collect{|v| [v] }, @device, @dtype)
|
49
|
+
target = TorchModel.tensor(labels, @device, @dtype)
|
33
50
|
|
34
|
-
|
35
|
-
|
51
|
+
Log::ProgressBar.with_bar epochs, :desc => "Training" do |bar|
|
52
|
+
epochs.times do |i|
|
53
|
+
@optimizer.zero_grad()
|
54
|
+
outputs = model.call(inputs)
|
55
|
+
outputs = outputs.squeeze() if target.dim() == 1
|
56
|
+
loss = criterion.call(outputs, target)
|
57
|
+
loss.backward()
|
58
|
+
@optimizer.step
|
59
|
+
Log.debug "Epoch #{i}, loss #{loss}"
|
60
|
+
bar.tick
|
61
|
+
end
|
62
|
+
end
|
63
|
+
TorchModel.save_architecture(model, model_path) if @directory
|
64
|
+
TorchModel.save_state(model, model_path) if @directory
|
65
|
+
end
|
36
66
|
end
|
37
67
|
end
|
68
|
+
require_relative 'torch/helpers'
|
69
|
+
require_relative 'torch/dataloader'
|
70
|
+
require_relative 'torch/introspection'
|
71
|
+
require_relative 'torch/load_and_save'
|
data/lib/rbbt/vector/model.rb
CHANGED
@@ -448,10 +448,10 @@ cat(paste(label, sep="\\n", collapse="\\n"));
|
|
448
448
|
end
|
449
449
|
|
450
450
|
test_set = feature_folds[fix]
|
451
|
-
train_set = feature_folds.values_at(*rest).
|
451
|
+
train_set = feature_folds.values_at(*rest).flatten(1)
|
452
452
|
|
453
453
|
test_labels = labels_folds[fix]
|
454
|
-
train_labels = labels_folds.values_at(*rest).flatten
|
454
|
+
train_labels = labels_folds.values_at(*rest).flatten(1)
|
455
455
|
|
456
456
|
@features = train_set
|
457
457
|
@labels = train_labels
|
data/python/rbbt_dm/__init__.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
|
1
|
+
import rbbt
|
2
|
+
import torch
|
3
|
+
from .util import *
|
2
4
|
|
3
|
-
class TSVDataset(Dataset):
|
5
|
+
class TSVDataset(torch.utils.data.Dataset):
|
4
6
|
def __init__(self, tsv):
|
5
7
|
self.tsv = tsv
|
6
8
|
|
@@ -20,29 +22,10 @@ class TSVDataset(Dataset):
|
|
20
22
|
return len(self.tsv)
|
21
23
|
|
22
24
|
def tsv_dataset(filename, *args, **kwargs):
|
23
|
-
import rbbt
|
24
25
|
return TSVDataset(rbbt.tsv(filename, *args, **kwargs))
|
25
26
|
|
26
27
|
def tsv(*args, **kwargs):
|
27
28
|
return tsv_dataset(*args, **kwargs)
|
28
29
|
|
29
30
|
def data_dir():
|
30
|
-
import rbbt
|
31
31
|
return rbbt.path('var/rbbt_dm/data')
|
32
|
-
|
33
|
-
if __name__ == "__main__":
|
34
|
-
import rbbt
|
35
|
-
|
36
|
-
filename = "/home/miki/test/numeric.tsv"
|
37
|
-
ds = tsv(filename)
|
38
|
-
|
39
|
-
dl = DataLoader(ds, batch_size=1)
|
40
|
-
|
41
|
-
for f, l in iter(dl):
|
42
|
-
print(".")
|
43
|
-
print(f[0,:])
|
44
|
-
print(l[0])
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
@@ -1,4 +1,6 @@
|
|
1
1
|
#{{{ LOAD MODEL
|
2
|
+
import datasets
|
3
|
+
import rbbt
|
2
4
|
|
3
5
|
def import_module_class(module, class_name):
|
4
6
|
if (not module == None):
|
@@ -57,12 +59,15 @@ def eval_model(model, tokenizer, texts, return_logits = True):
|
|
57
59
|
#{{{ TRAIN AND PREDICT
|
58
60
|
|
59
61
|
def load_tsv(tsv_file):
|
60
|
-
|
61
|
-
|
62
|
+
tsv = rbbt.tsv(tsv_file)
|
63
|
+
print(tsv)
|
64
|
+
ds = datasets.Dataset.from_pandas(tsv)
|
65
|
+
d = datasets.DatasetDict()
|
66
|
+
d["train"] = ds
|
67
|
+
return d
|
62
68
|
|
63
69
|
def load_json(json_file):
|
64
|
-
|
65
|
-
return load_dataset('json', data_files=[json_file])
|
70
|
+
return datasets.load_dataset('json', data_files=[json_file])
|
66
71
|
|
67
72
|
def tokenize_dataset(tokenizer, dataset):
|
68
73
|
return dataset.map(lambda subset: subset if ("input_ids" in subset.keys()) else tokenizer(subset["text"], truncation=True), batched=True)
|
data/python/rbbt_dm/util.py
CHANGED
@@ -42,7 +42,7 @@ class TestHuggingface < Test::Unit::TestCase
|
|
42
42
|
assert_equal 5, tokenizer.call("This is a sentence that has several words", truncation: true)["input_ids"].__len__
|
43
43
|
end
|
44
44
|
|
45
|
-
def
|
45
|
+
def test_sst_eval
|
46
46
|
TmpFile.with_file do |dir|
|
47
47
|
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
48
48
|
|
@@ -56,7 +56,7 @@ class TestHuggingface < Test::Unit::TestCase
|
|
56
56
|
end
|
57
57
|
|
58
58
|
|
59
|
-
def
|
59
|
+
def _test_sst_train
|
60
60
|
TmpFile.with_file do |dir|
|
61
61
|
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
|
62
62
|
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
|
2
|
+
require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
|
3
|
+
|
4
|
+
class TestPythonModel < Test::Unit::TestCase
|
5
|
+
def test_linear
|
6
|
+
model = nil
|
7
|
+
|
8
|
+
TmpFile.with_dir do |dir|
|
9
|
+
|
10
|
+
Misc.in_dir dir do
|
11
|
+
Open.write 'model.py', <<-EOF
|
12
|
+
class TestModel:
|
13
|
+
def __init__(self, delta):
|
14
|
+
self.delta = delta
|
15
|
+
|
16
|
+
def eval(self, x):
|
17
|
+
return [e + self.delta for e in x]
|
18
|
+
EOF
|
19
|
+
model = PythonModel.new dir, 'TestModel', :model, delta: 1
|
20
|
+
|
21
|
+
assert_equal 2, model.eval(1)
|
22
|
+
assert_equal [4, 6], model.eval_list([3, 5])
|
23
|
+
|
24
|
+
model = PythonModel.new dir, 'TestModel', :model, delta: 2
|
25
|
+
|
26
|
+
assert_equal 3, model.eval(1)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
@@ -2,82 +2,96 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_he
|
|
2
2
|
require 'rbbt/vector/model/pytorch_lightning'
|
3
3
|
|
4
4
|
class TestPytorchLightning < Test::Unit::TestCase
|
5
|
-
def
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
data = TSV.setup({}, :key_field => "Gene", :fields => samples + ["cluster"], :type => :list, :cast => :to_f)
|
10
|
-
|
11
|
-
profiles = []
|
12
|
-
p0 = 3
|
13
|
-
p1 = 7
|
14
|
-
profiles[0] = nsamples.times.collect{ rand() + p0 }
|
15
|
-
profiles[1] = nsamples.times.collect{ rand() + p1 }
|
16
|
-
|
17
|
-
ngenes.times do |genen|
|
18
|
-
gene = "Gene-#{genen}"
|
19
|
-
cluster = genen % 2
|
20
|
-
values = profiles[cluster].collect do |m|
|
21
|
-
rand() + m
|
22
|
-
end
|
23
|
-
data[gene] = values + [cluster]
|
24
|
-
end
|
5
|
+
def test_regresion
|
6
|
+
points = 10
|
7
|
+
a = 1
|
8
|
+
b = 1
|
25
9
|
|
10
|
+
x = (0..points - 1)
|
11
|
+
y = points.times.collect{|p| p }
|
12
|
+
|
26
13
|
python = <<~EOF
|
27
|
-
import torch
|
28
|
-
from torch import nn
|
29
|
-
from torch.nn import functional as F
|
30
|
-
from torch.utils.data import DataLoader
|
31
|
-
from torch.utils.data import random_split
|
32
|
-
from torchvision.datasets import MNIST
|
33
|
-
from torchvision import transforms
|
34
14
|
import pytorch_lightning as pl
|
15
|
+
import numpy as np
|
16
|
+
import torch
|
17
|
+
from torch.nn import MSELoss
|
18
|
+
from torch.optim import Adam
|
19
|
+
from torch.utils.data import DataLoader, Dataset
|
20
|
+
import torch.nn as nn
|
21
|
+
|
22
|
+
|
23
|
+
class SimpleDataset(Dataset):
|
24
|
+
def __init__(self):
|
25
|
+
X = np.arange(10000)
|
26
|
+
y = X * 2
|
27
|
+
X = [[_] for _ in X]
|
28
|
+
y = [[_] for _ in y]
|
29
|
+
self.X = torch.Tensor(X)
|
30
|
+
self.y = torch.Tensor(y)
|
31
|
+
|
32
|
+
def __len__(self):
|
33
|
+
return len(self.y)
|
34
|
+
|
35
|
+
def __getitem__(self, idx):
|
36
|
+
return {"X": self.X[idx], "y": self.y[idx]}
|
37
|
+
|
35
38
|
|
36
39
|
class TestPytorchLightningModel(pl.LightningModule):
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
x, y = train_batch
|
53
|
-
x = x.to(self.dtype)
|
54
|
-
y = y.to(self.dtype)
|
55
|
-
y_hat = self.model(x).squeeze()
|
56
|
-
loss = F.mse_loss(y, y_hat)
|
57
|
-
self.log('train_loss', loss)
|
58
|
-
return loss
|
59
|
-
|
60
|
-
@torch.cuda.amp.custom_fwd(cast_inputs=torch.float64)
|
61
|
-
def validation_step(self, val_batch, batch_idx):
|
62
|
-
x, y = train_batch
|
63
|
-
y_hat = self.model(x)
|
64
|
-
loss = F.mse_loss(y, y_hat)
|
65
|
-
self.log('val_loss', loss)
|
40
|
+
def __init__(self):
|
41
|
+
super().__init__()
|
42
|
+
self.fc = nn.Linear(1, 1)
|
43
|
+
self.criterion = MSELoss()
|
44
|
+
|
45
|
+
def forward(self, inputs, labels=None):
|
46
|
+
outputs = self.fc(inputs)
|
47
|
+
loss = 0
|
48
|
+
if labels is not None:
|
49
|
+
loss = self.criterion(outputs, labels)
|
50
|
+
return loss, outputs
|
51
|
+
|
52
|
+
def train_dataloader(self):
|
53
|
+
dataset = SimpleDataset()
|
54
|
+
return DataLoader(dataset, batch_size=1000)
|
66
55
|
|
56
|
+
def training_step(self, batch, batch_idx):
|
57
|
+
input_ids = batch["X"]
|
58
|
+
labels = batch["y"]
|
59
|
+
loss, outputs = self(input_ids, labels)
|
60
|
+
return {"loss": loss}
|
61
|
+
|
62
|
+
def configure_optimizers(self):
|
63
|
+
optimizer = Adam(self.parameters(), lr=0.1)
|
64
|
+
return optimizer
|
67
65
|
EOF
|
68
66
|
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
67
|
+
TmpFile.with_dir do |dir|
|
68
|
+
Open.write(File.join(dir, 'model.py'), python)
|
69
|
+
model = PytorchLightningModel.new dir, "TestPytorchLightningModel"
|
70
|
+
model.init
|
71
|
+
|
72
|
+
model.trainer = RbbtPython.class_new_obj("pytorch_lightning", "Trainer", max_epochs: 10, precision: 16)
|
73
|
+
model.init
|
74
|
+
|
76
75
|
model.train
|
77
|
-
|
78
|
-
|
76
|
+
|
77
|
+
w = model.get_weights('fc').to_ruby.first.first
|
78
|
+
|
79
|
+
assert w > 1.8
|
80
|
+
assert w < 2.2
|
81
|
+
|
82
|
+
res = model.eval(10.0)
|
83
|
+
assert_equal res, (10 * w)
|
84
|
+
assert res > 1.8 * 10.0
|
85
|
+
assert res < 2.2 * 10.0
|
86
|
+
|
87
|
+
res = model.eval([10.0])
|
88
|
+
res = model.eval_list([[10.0], [11.2], [14.3]])
|
89
|
+
assert_equal 3, RbbtPython.numpy2ruby(res).length
|
90
|
+
|
91
|
+
model = VectorModel.new dir
|
92
|
+
model.init
|
93
|
+
|
79
94
|
end
|
80
95
|
end
|
81
|
-
|
82
96
|
end
|
83
97
|
|
@@ -0,0 +1,61 @@
|
|
1
|
+
require File.expand_path(__FILE__).sub(%r(/test/.*), '/test/test_helper.rb')
|
2
|
+
require File.expand_path(__FILE__).sub(%r(.*/test/), '').sub(/test_(.*)\.rb/,'\1')
|
3
|
+
|
4
|
+
class TestTorch < Test::Unit::TestCase
|
5
|
+
def test_linear
|
6
|
+
model = nil
|
7
|
+
|
8
|
+
TmpFile.with_dir do |dir|
|
9
|
+
|
10
|
+
# Create model
|
11
|
+
|
12
|
+
model = TorchModel.new dir
|
13
|
+
model.model = RbbtPython.torch.nn.Linear.new(1, 1)
|
14
|
+
model.criterion = RbbtPython.torch.nn.MSELoss.new()
|
15
|
+
|
16
|
+
model.extract_features do |f|
|
17
|
+
[f]
|
18
|
+
end
|
19
|
+
|
20
|
+
model.post_process do |v,list|
|
21
|
+
list ? v.to_ruby.collect{|vv| vv.first } : v.to_ruby.first
|
22
|
+
end
|
23
|
+
|
24
|
+
# Train model
|
25
|
+
|
26
|
+
model.add 5.0, [10.0]
|
27
|
+
model.add 10.0, [20.0]
|
28
|
+
|
29
|
+
model.training_args[:epochs] = 1000
|
30
|
+
model.train
|
31
|
+
|
32
|
+
w = model.get_weights.to_ruby.first.first
|
33
|
+
|
34
|
+
assert w > 1.8
|
35
|
+
assert w < 2.2
|
36
|
+
|
37
|
+
# Load the model again
|
38
|
+
|
39
|
+
model = VectorModel.new dir
|
40
|
+
|
41
|
+
# Test model
|
42
|
+
|
43
|
+
y = model.eval(100.0)
|
44
|
+
|
45
|
+
assert(y > 150.0)
|
46
|
+
assert(y < 250.0)
|
47
|
+
|
48
|
+
test = [1.0, 5.0, 10.0, 20.0]
|
49
|
+
input_sum = Misc.sum(test)
|
50
|
+
sum = Misc.sum(model.eval_list(test))
|
51
|
+
assert sum > 0.8 * input_sum * 2
|
52
|
+
assert sum < 1.2 * input_sum * 2
|
53
|
+
|
54
|
+
w = TorchModel.get_weights(model.model).to_ruby.first.first
|
55
|
+
|
56
|
+
assert w > 1.8
|
57
|
+
assert w < 2.2
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-dm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-12-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -108,12 +108,17 @@ files:
|
|
108
108
|
- lib/rbbt/vector/model.rb
|
109
109
|
- lib/rbbt/vector/model/huggingface.rb
|
110
110
|
- lib/rbbt/vector/model/huggingface/masked_lm.rb
|
111
|
+
- lib/rbbt/vector/model/python.rb
|
111
112
|
- lib/rbbt/vector/model/pytorch_lightning.rb
|
112
113
|
- lib/rbbt/vector/model/random_forest.rb
|
113
114
|
- lib/rbbt/vector/model/spaCy.rb
|
114
115
|
- lib/rbbt/vector/model/svm.rb
|
115
116
|
- lib/rbbt/vector/model/tensorflow.rb
|
116
117
|
- lib/rbbt/vector/model/torch.rb
|
118
|
+
- lib/rbbt/vector/model/torch/dataloader.rb
|
119
|
+
- lib/rbbt/vector/model/torch/helpers.rb
|
120
|
+
- lib/rbbt/vector/model/torch/introspection.rb
|
121
|
+
- lib/rbbt/vector/model/torch/load_and_save.rb
|
117
122
|
- lib/rbbt/vector/model/util.rb
|
118
123
|
- python/rbbt_dm/__init__.py
|
119
124
|
- python/rbbt_dm/atcold/__init__.py
|
@@ -143,10 +148,12 @@ files:
|
|
143
148
|
- test/rbbt/test_stan.rb
|
144
149
|
- test/rbbt/vector/model/huggingface/test_masked_lm.rb
|
145
150
|
- test/rbbt/vector/model/test_huggingface.rb
|
151
|
+
- test/rbbt/vector/model/test_python.rb
|
146
152
|
- test/rbbt/vector/model/test_pytorch_lightning.rb
|
147
153
|
- test/rbbt/vector/model/test_spaCy.rb
|
148
154
|
- test/rbbt/vector/model/test_svm.rb
|
149
155
|
- test/rbbt/vector/model/test_tensorflow.rb
|
156
|
+
- test/rbbt/vector/model/test_torch.rb
|
150
157
|
- test/rbbt/vector/test_model.rb
|
151
158
|
- test/test_helper.rb
|
152
159
|
homepage: http://github.com/mikisvaz/rbbt-phgx
|
@@ -167,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
167
174
|
- !ruby/object:Gem::Version
|
168
175
|
version: '0'
|
169
176
|
requirements: []
|
170
|
-
rubygems_version: 3.
|
177
|
+
rubygems_version: 3.5.0.dev
|
171
178
|
signing_key:
|
172
179
|
specification_version: 4
|
173
180
|
summary: Data-mining and statistics
|
@@ -182,9 +189,11 @@ test_files:
|
|
182
189
|
- test/rbbt/test_stan.rb
|
183
190
|
- test/rbbt/vector/model/huggingface/test_masked_lm.rb
|
184
191
|
- test/rbbt/vector/model/test_huggingface.rb
|
192
|
+
- test/rbbt/vector/model/test_python.rb
|
185
193
|
- test/rbbt/vector/model/test_pytorch_lightning.rb
|
186
194
|
- test/rbbt/vector/model/test_spaCy.rb
|
187
195
|
- test/rbbt/vector/model/test_svm.rb
|
188
196
|
- test/rbbt/vector/model/test_tensorflow.rb
|
197
|
+
- test/rbbt/vector/model/test_torch.rb
|
189
198
|
- test/rbbt/vector/test_model.rb
|
190
199
|
- test/test_helper.rb
|