torch-rb 0.1.3

Files changed (44)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +28 -0
  3. data/LICENSE.txt +46 -0
  4. data/README.md +426 -0
  5. data/ext/torch/ext.cpp +839 -0
  6. data/ext/torch/extconf.rb +25 -0
  7. data/lib/torch-rb.rb +1 -0
  8. data/lib/torch.rb +422 -0
  9. data/lib/torch/ext.bundle +0 -0
  10. data/lib/torch/inspector.rb +85 -0
  11. data/lib/torch/nn/alpha_dropout.rb +9 -0
  12. data/lib/torch/nn/conv2d.rb +37 -0
  13. data/lib/torch/nn/convnd.rb +41 -0
  14. data/lib/torch/nn/dropout.rb +9 -0
  15. data/lib/torch/nn/dropout2d.rb +9 -0
  16. data/lib/torch/nn/dropout3d.rb +9 -0
  17. data/lib/torch/nn/dropoutnd.rb +15 -0
  18. data/lib/torch/nn/embedding.rb +52 -0
  19. data/lib/torch/nn/feature_alpha_dropout.rb +9 -0
  20. data/lib/torch/nn/functional.rb +100 -0
  21. data/lib/torch/nn/init.rb +30 -0
  22. data/lib/torch/nn/linear.rb +36 -0
  23. data/lib/torch/nn/module.rb +85 -0
  24. data/lib/torch/nn/mse_loss.rb +13 -0
  25. data/lib/torch/nn/parameter.rb +14 -0
  26. data/lib/torch/nn/relu.rb +13 -0
  27. data/lib/torch/nn/sequential.rb +29 -0
  28. data/lib/torch/optim/adadelta.rb +57 -0
  29. data/lib/torch/optim/adagrad.rb +71 -0
  30. data/lib/torch/optim/adam.rb +81 -0
  31. data/lib/torch/optim/adamax.rb +68 -0
  32. data/lib/torch/optim/adamw.rb +82 -0
  33. data/lib/torch/optim/asgd.rb +65 -0
  34. data/lib/torch/optim/lr_scheduler/lr_scheduler.rb +33 -0
  35. data/lib/torch/optim/lr_scheduler/step_lr.rb +17 -0
  36. data/lib/torch/optim/optimizer.rb +62 -0
  37. data/lib/torch/optim/rmsprop.rb +76 -0
  38. data/lib/torch/optim/rprop.rb +68 -0
  39. data/lib/torch/optim/sgd.rb +60 -0
  40. data/lib/torch/tensor.rb +196 -0
  41. data/lib/torch/utils/data/data_loader.rb +27 -0
  42. data/lib/torch/utils/data/tensor_dataset.rb +22 -0
  43. data/lib/torch/version.rb +3 -0
  44. metadata +169 -0
data/lib/torch/nn/module.rb
@@ -0,0 +1,85 @@
+ module Torch
+   module NN
+     class Module
+       def initialize
+         @training = true
+       end
+
+       def inspect
+         str = String.new
+         str << "#{self.class.name}(\n"
+         modules.each do |name, mod|
+           str << "  (#{name}): #{mod.inspect}\n"
+         end
+         str << ")"
+       end
+
+       def train(mode = true)
+         @training = mode
+
+         modules.each do |_, mod|
+           mod.train(mode)
+         end
+       end
+
+       def eval
+         train(false)
+       end
+
+       def call(*input)
+         forward(*input)
+       end
+
+       # modifies in-place
+       def to(device)
+         instance_variables.each do |name|
+           param = instance_variable_get(name)
+           if param.is_a?(Parameter)
+             instance_variable_set(name, Parameter.new(param.to(device)))
+           end
+         end
+         modules.each do |_, mod|
+           mod.to(device)
+         end
+         self
+       end
+
+       def parameters
+         params = []
+         instance_variables.each do |name|
+           param = instance_variable_get(name)
+           params << param if param.is_a?(Parameter)
+         end
+         params + modules.flat_map { |_, mod| mod.parameters }
+       end
+
+       def zero_grad
+         parameters.each do |param|
+           if param.grad
+             param.grad.detach!
+             param.grad.zero!
+           end
+         end
+       end
+
+       def method_missing(method, *args, &block)
+         modules[method.to_s] || super
+       end
+
+       def respond_to?(method, include_private = false)
+         modules.key?(method.to_s) || super
+       end
+
+       private
+
+       def modules
+         modules = {}
+         instance_variables.each do |name|
+           mod = instance_variable_get(name)
+           modules[name[1..-1]] = mod if mod.is_a?(Module)
+         end
+         modules
+       end
+     end
+   end
+ end
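The class above discovers submodules and parameters by scanning instance variables, and method_missing exposes each submodule as a reader. A minimal usage sketch (not part of the diff), assuming Torch::NN::Linear.new(in_features, out_features) from elsewhere in this release behaves like its PyTorch counterpart:

# Usage sketch; Linear's constructor signature is an assumption.
require "torch"

class Net < Torch::NN::Module
  def initialize
    super()
    @fc1 = Torch::NN::Linear.new(784, 64)
    @relu = Torch::NN::ReLU.new
    @fc2 = Torch::NN::Linear.new(64, 10)
  end

  def forward(x)
    @fc2.call(@relu.call(@fc1.call(x)))
  end
end

net = Net.new
net.fc1          # submodule reader provided by method_missing
net.parameters   # flattens the parameters found in @fc1, @relu, and @fc2
net.eval         # recursively sets @training = false via train(false)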
data/lib/torch/nn/mse_loss.rb
@@ -0,0 +1,13 @@
+ module Torch
+   module NN
+     class MSELoss < Module
+       def initialize(reduction: "mean")
+         @reduction = reduction
+       end
+
+       def forward(input, target)
+         F.mse_loss(input, target, reduction: @reduction)
+       end
+     end
+   end
+ end
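A quick sketch of using the loss on its own (not part of the diff); Torch.tensor is assumed to build a float tensor from a Ruby array, as in PyTorch:

# Usage sketch; Torch.tensor's exact behavior is an assumption.
pred   = Torch.tensor([1.0, 2.0, 3.0])
target = Torch.tensor([1.5, 2.0, 2.5])

criterion = Torch::NN::MSELoss.new
loss = criterion.call(pred, target)  # Module#call dispatches to forward, i.e. F.mse_loss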
data/lib/torch/nn/parameter.rb
@@ -0,0 +1,14 @@
+ module Torch
+   module NN
+     class Parameter < Tensor
+       def self.new(data = nil, requires_grad: true)
+         data = Tensor.new unless data
+         Tensor._make_subclass(data, requires_grad)
+       end
+
+       def grad
+         _grad if _grad_defined
+       end
+     end
+   end
+ end
data/lib/torch/nn/relu.rb
@@ -0,0 +1,13 @@
+ module Torch
+   module NN
+     class ReLU < Module
+       def initialize #(inplace: false)
+         # @inplace = inplace
+       end
+
+       def forward(input)
+         F.relu(input) #, inplace: @inplace)
+       end
+     end
+   end
+ end
data/lib/torch/nn/sequential.rb
@@ -0,0 +1,29 @@
+ module Torch
+   module NN
+     class Sequential < Module
+       def initialize(*args)
+         @modules = {}
+         # TODO support hash arg (named modules)
+         args.each_with_index do |mod, idx|
+           add_module(idx.to_s, mod)
+         end
+       end
+
+       def add_module(name, mod)
+         # TODO add checks
+         @modules[name] = mod
+       end
+
+       def forward(input)
+         @modules.values.each do |mod|
+           input = mod.call(input)
+         end
+         input
+       end
+
+       def parameters
+         @modules.flat_map { |_, mod| mod.parameters }
+       end
+     end
+   end
+ end
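Sequential simply threads its input through the wrapped modules in insertion order and flattens their parameters. A short sketch (not part of the diff), with Torch::NN::Linear and Torch.rand assumed to mirror PyTorch:

# Usage sketch; Linear and Torch.rand are assumed APIs from elsewhere in the gem.
model = Torch::NN::Sequential.new(
  Torch::NN::Linear.new(10, 32),
  Torch::NN::ReLU.new,
  Torch::NN::Linear.new(32, 1)
)

input  = Torch.rand(8, 10)
output = model.call(input)  # forward passes input through each module in turn
model.parameters            # flat array of every Parameter in the three layers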
data/lib/torch/optim/adadelta.rb
@@ -0,0 +1,57 @@
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adadelta.py
+ module Torch
+   module Optim
+     class Adadelta < Optimizer
+       def initialize(params, lr: 1.0, rho: 0.9, eps: 1e-6, weight_decay: 0)
+         raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
+         raise ArgumentError, "Invalid rho value: #{rho}" if rho < 0 || rho > 1
+         raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
+         raise ArgumentError, "Invalid weight_decay value: #{weight_decay}" if weight_decay < 0
+
+         defaults = {lr: lr, rho: rho, eps: eps, weight_decay: weight_decay}
+         super(params, defaults)
+       end
+
+       def step(closure = nil)
+         loss = nil
+         if closure
+           loss = closure.call
+         end
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             next unless p.grad
+             grad = p.grad.data
+             if grad.sparse?
+               raise Error, "Adadelta does not support sparse gradients"
+             end
+             state = @state[p]
+
+             if state.size == 0
+               state[:step] = 0
+               state[:square_avg] = Torch.zeros_like(p.data)
+               state[:acc_delta] = Torch.zeros_like(p.data)
+             end
+
+             square_avg, acc_delta = state[:square_avg], state[:acc_delta]
+             rho, eps = group[:rho], group[:eps]
+
+             state[:step] += 1
+
+             if group[:weight_decay] != 0
+               grad = grad.add(group[:weight_decay], p.data)
+             end
+
+             square_avg.mul!(rho).addcmul!(1 - rho, grad, grad)
+             std = square_avg.add(eps).sqrt!
+             delta = acc_delta.add(eps).sqrt!.div!(std).mul!(grad)
+             p.data.add!(-group[:lr], delta)
+             acc_delta.mul!(rho).addcmul!(1 - rho, delta, delta)
+           end
+         end
+
+         loss
+       end
+     end
+   end
+ end
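For readers unfamiliar with the fluent in-place API: addcmul!(v, a, b) follows the older PyTorch convention of adding v * a * b element-wise, so with square_avg as the running average of squared gradients and acc_delta as the running average of squared updates, the step body above implements the standard Adadelta recurrences:

$$
\begin{aligned}
E[g^2]_t &= \rho\, E[g^2]_{t-1} + (1 - \rho)\, g_t^2 \\
\Delta_t &= \frac{\sqrt{E[\Delta^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\, g_t \\
\theta_{t+1} &= \theta_t - \mathrm{lr} \cdot \Delta_t \\
E[\Delta^2]_t &= \rho\, E[\Delta^2]_{t-1} + (1 - \rho)\, \Delta_t^2
\end{aligned}
$$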
data/lib/torch/optim/adagrad.rb
@@ -0,0 +1,71 @@
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adagrad.py
+ module Torch
+   module Optim
+     class Adagrad < Optimizer
+       def initialize(params, lr: 1e-2, lr_decay: 0, weight_decay: 0, initial_accumulator_value: 0, eps: 1e-10)
+         raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
+         raise ArgumentError, "Invalid lr_decay value: #{lr_decay}" if lr_decay < 0
+         raise ArgumentError, "Invalid initial_accumulator_value value: #{initial_accumulator_value}" if initial_accumulator_value < 0
+         raise ArgumentError, "Invalid weight_decay value: #{weight_decay}" if weight_decay < 0
+         raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
+
+         defaults = {lr: lr, lr_decay: lr_decay, eps: eps, weight_decay: weight_decay, initial_accumulator_value: initial_accumulator_value}
+         super(params, defaults)
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             state = @state[p]
+             state[:step] = 0
+             state[:sum] = Torch.full_like(p.data, initial_accumulator_value)
+           end
+         end
+       end
+
+       def share_memory
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             state = @state[p]
+             state[:sum].share_memory!
+           end
+         end
+       end
+
+       def step(closure = nil)
+         loss = nil
+         if closure
+           loss = closure.call
+         end
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             next unless p.grad
+
+             grad = p.grad.data
+             state = @state[p]
+
+             state[:step] += 1
+
+             if group[:weight_decay] != 0
+               if p.grad.data.sparse?
+                 raise Error, "weight_decay option is not compatible with sparse gradients"
+               end
+               grad = grad.add(group[:weight_decay], p.data)
+             end
+
+             clr = group[:lr] / (1 + (state[:step] - 1) * group[:lr_decay])
+
+             if grad.sparse?
+               raise NotImplementedYet
+             else
+               state[:sum].addcmul!(1, grad, grad)
+               std = state[:sum].sqrt.add!(group[:eps])
+               p.data.addcdiv!(-clr, grad, std)
+             end
+           end
+         end
+
+         loss
+       end
+     end
+   end
+ end
data/lib/torch/optim/adam.rb
@@ -0,0 +1,81 @@
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adam.py
+ module Torch
+   module Optim
+     class Adam < Optimizer
+       def initialize(params, lr: 1e-3, betas: [0.9, 0.999], eps: 1e-8, weight_decay: 0, amsgrad: false)
+         raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
+         raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
+         raise ArgumentError, "Invalid beta parameter at index 0: #{betas[0]}" if betas[0] < 0 || betas[0] >= 1
+         raise ArgumentError, "Invalid beta parameter at index 1: #{betas[1]}" if betas[1] < 0 || betas[1] >= 1
+
+         defaults = {lr: lr, betas: betas, eps: eps, weight_decay: weight_decay, amsgrad: amsgrad}
+         super(params, defaults)
+       end
+
+       def step(closure = nil)
+         loss = nil
+         if closure
+           loss = closure.call
+         end
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             next unless p.grad
+             grad = p.grad.data
+             if grad.sparse?
+               raise Error, "Adam does not support sparse gradients, please consider SparseAdam instead"
+             end
+             amsgrad = group[:amsgrad]
+
+             state = @state[p]
+
+             # State initialization
+             if state.size == 0
+               state[:step] = 0
+               # Exponential moving average of gradient values
+               state[:exp_avg] = Torch.zeros_like(p.data)
+               # Exponential moving average of squared gradient values
+               state[:exp_avg_sq] = Torch.zeros_like(p.data)
+               if amsgrad
+                 # Maintains max of all exp. moving avg. of sq. grad. values
+                 state[:max_exp_avg_sq] = Torch.zeros_like(p.data)
+               end
+             end
+
+             exp_avg, exp_avg_sq = state[:exp_avg], state[:exp_avg_sq]
+             if amsgrad
+               max_exp_avg_sq = state[:max_exp_avg_sq]
+             end
+             beta1, beta2 = group[:betas]
+
+             state[:step] += 1
+             bias_correction1 = 1 - beta1 ** state[:step]
+             bias_correction2 = 1 - beta2 ** state[:step]
+
+             if group[:weight_decay] != 0
+               grad.add!(group[:weight_decay], p.data)
+             end
+
+             # Decay the first and second moment running average coefficient
+             exp_avg.mul!(beta1).add!(1 - beta1, grad)
+             exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
+             if amsgrad
+               # Maintains the maximum of all 2nd moment running avg. till now
+               Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq)
+               # Use the max. for normalizing running avg. of gradient
+               denom = (max_exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
+             else
+               denom = (exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
+             end
+
+             step_size = group[:lr] / bias_correction1
+
+             p.data.addcdiv!(-step_size, exp_avg, denom)
+           end
+         end
+
+         loss
+       end
+     end
+   end
+ end
1
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adamax.py
2
+ module Torch
3
+ module Optim
4
+ class Adamax < Optimizer
5
+ def initialize(params, lr: 2e-3, betas: [0.9, 0.999], eps: 1e-8, weight_decay: 0)
6
+ raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
7
+ raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
8
+ raise ArgumentError, "Invalid beta parameter at index 0: #{betas[0]}" if betas[0] < 0 || betas[0] >= 1
9
+ raise ArgumentError, "Invalid beta parameter at index 1: #{betas[1]}" if betas[1] < 0 || betas[1] >= 1
10
+ raise ArgumentError, "Invalid weight_decay value: #{weight_decay}" if weight_decay < 0
11
+
12
+ defaults = {lr: lr, betas: betas, eps: eps, weight_decay: weight_decay}
13
+ super(params, defaults)
14
+ end
15
+
16
+ def step(closure = nil)
17
+ loss = nil
18
+ if closure
19
+ loss = closure.call
20
+ end
21
+
22
+ @param_groups.each do |group|
23
+ group[:params].each do |p|
24
+ next unless p.grad
25
+ grad = p.grad.data
26
+ if grad.sparse?
27
+ raise Error, "Adamax does not support sparse gradients, please consider SparseAdam instead"
28
+ end
29
+ state = @state[p]
30
+
31
+ # State initialization
32
+ if state.size == 0
33
+ state[:step] = 0
34
+ state[:exp_avg] = Torch.zeros_like(p.data)
35
+ state[:exp_inf] = Torch.zeros_like(p.data)
36
+ end
37
+
38
+ exp_avg, exp_inf = state[:exp_avg], state[:exp_inf]
39
+ beta1, beta2 = group[:betas]
40
+ eps = group[:eps]
41
+
42
+ state[:step] += 1
43
+
44
+ if group[:weight_decay] != 0
45
+ grad = grad.add(group[:weight_decay], p.data)
46
+ end
47
+
48
+ # Update biased first moment estimate.
49
+ exp_avg.mul!(beta1).add!(1 - beta1, grad)
50
+ # Update the exponentially weighted infinity norm.
51
+ norm_buf = Torch.cat([
52
+ exp_inf.mul!(beta2).unsqueeze(0),
53
+ grad.abs.add!(eps).unsqueeze!(0)
54
+ ], 0)
55
+ Torch.max(norm_buf, 0, keepdim: false, out: [exp_inf, exp_inf.new.long])
56
+
57
+ bias_correction = 1 - beta1 ** state[:step]
58
+ clr = group[:lr] / bias_correction
59
+
60
+ p.data.addcdiv!(-clr, exp_avg, exp_inf)
61
+ end
62
+ end
63
+
64
+ loss
65
+ end
66
+ end
67
+ end
68
+ end
data/lib/torch/optim/adamw.rb
@@ -0,0 +1,82 @@
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adamw.py
+ module Torch
+   module Optim
+     class AdamW < Optimizer
+       def initialize(params, lr: 1e-3, betas: [0.9, 0.999], eps: 1e-8, weight_decay: 1e-2, amsgrad: false)
+         raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
+         raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
+         raise ArgumentError, "Invalid beta parameter at index 0: #{betas[0]}" if betas[0] < 0 || betas[0] >= 1
+         raise ArgumentError, "Invalid beta parameter at index 1: #{betas[1]}" if betas[1] < 0 || betas[1] >= 1
+
+         defaults = {lr: lr, betas: betas, eps: eps, weight_decay: weight_decay, amsgrad: amsgrad}
+         super(params, defaults)
+       end
+
+       def step(closure = nil)
+         loss = nil
+         if closure
+           loss = closure.call
+         end
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             next unless p.grad
+
+             # Perform stepweight decay
+             p.data.mul!(1 - group[:lr] * group[:weight_decay])
+
+             # Perform optimization step
+             grad = p.grad.data
+             if grad.sparse?
+               raise Error, "AdamW does not support sparse gradients, please consider SparseAdam instead"
+             end
+             amsgrad = group[:amsgrad]
+
+             state = @state[p]
+
+             # State initialization
+             if state.size == 0
+               state[:step] = 0
+               # Exponential moving average of gradient values
+               state[:exp_avg] = Torch.zeros_like(p.data)
+               # Exponential moving average of squared gradient values
+               state[:exp_avg_sq] = Torch.zeros_like(p.data)
+               if amsgrad
+                 # Maintains max of all exp. moving avg. of sq. grad. values
+                 state[:max_exp_avg_sq] = Torch.zeros_like(p.data)
+               end
+             end
+
+             exp_avg, exp_avg_sq = state[:exp_avg], state[:exp_avg_sq]
+             if amsgrad
+               max_exp_avg_sq = state[:max_exp_avg_sq]
+             end
+             beta1, beta2 = group[:betas]
+
+             state[:step] += 1
+             bias_correction1 = 1 - beta1 ** state[:step]
+             bias_correction2 = 1 - beta2 ** state[:step]
+
+             # Decay the first and second moment running average coefficient
+             exp_avg.mul!(beta1).add!(1 - beta1, grad)
+             exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
+             if amsgrad
+               # Maintains the maximum of all 2nd moment running avg. till now
+               Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq)
+               # Use the max. for normalizing running avg. of gradient
+               denom = (max_exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
+             else
+               denom = (exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
+             end
+
+             step_size = group[:lr] / bias_correction1
+
+             p.data.addcdiv!(-step_size, exp_avg, denom)
+           end
+         end
+
+         loss
+       end
+     end
+   end
+ end
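Compared with Adam above, AdamW applies weight decay directly to the weights (p.data.mul!(1 - lr * weight_decay)) before the moment updates rather than folding it into the gradient, so swapping it into the earlier training sketch is a one-line change:

# Sketch: switching to decoupled weight decay (model.parameters as before).
optimizer = Torch::Optim::AdamW.new(model.parameters, lr: 1e-3, weight_decay: 1e-2)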