torch-rb 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +28 -0
  3. data/LICENSE.txt +46 -0
  4. data/README.md +426 -0
  5. data/ext/torch/ext.cpp +839 -0
  6. data/ext/torch/extconf.rb +25 -0
  7. data/lib/torch-rb.rb +1 -0
  8. data/lib/torch.rb +422 -0
  9. data/lib/torch/ext.bundle +0 -0
  10. data/lib/torch/inspector.rb +85 -0
  11. data/lib/torch/nn/alpha_dropout.rb +9 -0
  12. data/lib/torch/nn/conv2d.rb +37 -0
  13. data/lib/torch/nn/convnd.rb +41 -0
  14. data/lib/torch/nn/dropout.rb +9 -0
  15. data/lib/torch/nn/dropout2d.rb +9 -0
  16. data/lib/torch/nn/dropout3d.rb +9 -0
  17. data/lib/torch/nn/dropoutnd.rb +15 -0
  18. data/lib/torch/nn/embedding.rb +52 -0
  19. data/lib/torch/nn/feature_alpha_dropout.rb +9 -0
  20. data/lib/torch/nn/functional.rb +100 -0
  21. data/lib/torch/nn/init.rb +30 -0
  22. data/lib/torch/nn/linear.rb +36 -0
  23. data/lib/torch/nn/module.rb +85 -0
  24. data/lib/torch/nn/mse_loss.rb +13 -0
  25. data/lib/torch/nn/parameter.rb +14 -0
  26. data/lib/torch/nn/relu.rb +13 -0
  27. data/lib/torch/nn/sequential.rb +29 -0
  28. data/lib/torch/optim/adadelta.rb +57 -0
  29. data/lib/torch/optim/adagrad.rb +71 -0
  30. data/lib/torch/optim/adam.rb +81 -0
  31. data/lib/torch/optim/adamax.rb +68 -0
  32. data/lib/torch/optim/adamw.rb +82 -0
  33. data/lib/torch/optim/asgd.rb +65 -0
  34. data/lib/torch/optim/lr_scheduler/lr_scheduler.rb +33 -0
  35. data/lib/torch/optim/lr_scheduler/step_lr.rb +17 -0
  36. data/lib/torch/optim/optimizer.rb +62 -0
  37. data/lib/torch/optim/rmsprop.rb +76 -0
  38. data/lib/torch/optim/rprop.rb +68 -0
  39. data/lib/torch/optim/sgd.rb +60 -0
  40. data/lib/torch/tensor.rb +196 -0
  41. data/lib/torch/utils/data/data_loader.rb +27 -0
  42. data/lib/torch/utils/data/tensor_dataset.rb +22 -0
  43. data/lib/torch/version.rb +3 -0
  44. metadata +169 -0
data/lib/torch/nn/module.rb
@@ -0,0 +1,85 @@
+ module Torch
+   module NN
+     class Module
+       def initialize
+         @training = true
+       end
+
+       def inspect
+         str = String.new
+         str << "#{self.class.name}(\n"
+         modules.each do |name, mod|
+           str << " (#{name}): #{mod.inspect}\n"
+         end
+         str << ")"
+       end
+
+       def train(mode = true)
+         @training = mode
+
+         modules.each do |_, mod|
+           mod.train(mode)
+         end
+       end
+
+       def eval
+         train(false)
+       end
+
+       def call(*input)
+         forward(*input)
+       end
+
+       # modifies in-place
+       def to(device)
+         instance_variables.each do |name|
+           param = instance_variable_get(name)
+           if param.is_a?(Parameter)
+             instance_variable_set(name, Parameter.new(param.to(device)))
+           end
+         end
+         modules.each do |_, mod|
+           mod.to(device)
+         end
+         self
+       end
+
+       def parameters
+         params = []
+         instance_variables.each do |name|
+           param = instance_variable_get(name)
+           params << param if param.is_a?(Parameter)
+         end
+         params + modules.flat_map { |_, mod| mod.parameters }
+       end
+
+       def zero_grad
+         parameters.each do |param|
+           if param.grad
+             param.grad.detach!
+             param.grad.zero!
+           end
+         end
+       end
+
+       def method_missing(method, *args, &block)
+         modules[method.to_s] || super
+       end
+
+       def respond_to?(method, include_private = false)
+         modules.key?(method.to_s) || super
+       end
+
+       private
+
+       def modules
+         modules = {}
+         instance_variables.each do |name|
+           mod = instance_variable_get(name)
+           modules[name[1..-1]] = mod if mod.is_a?(Module)
+         end
+         modules
+       end
+     end
+   end
+ end
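
For context, a minimal subclass sketch of the Module API above. Linear (from data/lib/torch/nn/linear.rb, assumed signature Linear.new(in_features, out_features)) and its default bias parameter are assumptions based on the rest of the gem; everything else follows the code shown here.

  require "torch"

  # Child modules and parameters are plain instance variables; Module#parameters
  # and the private Module#modules helper discover them by reflection.
  class TinyNet < Torch::NN::Module
    def initialize
      super
      @fc1 = Torch::NN::Linear.new(4, 8)   # assumed: Linear.new(in_features, out_features)
      @relu = Torch::NN::ReLU.new
      @fc2 = Torch::NN::Linear.new(8, 1)
    end

    def forward(x)
      @fc2.call(@relu.call(@fc1.call(x)))
    end
  end

  net = TinyNet.new
  net.parameters.length # => 4, assuming each Linear holds a weight and a bias Parameter
  net.eval              # Module#train(false), applied recursively to child modules
  net.fc1               # method_missing exposes child modules by instance variable name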
data/lib/torch/nn/mse_loss.rb
@@ -0,0 +1,13 @@
+ module Torch
+   module NN
+     class MSELoss < Module
+       def initialize(reduction: "mean")
+         @reduction = reduction
+       end
+
+       def forward(input, target)
+         F.mse_loss(input, target, reduction: @reduction)
+       end
+     end
+   end
+ end
data/lib/torch/nn/parameter.rb
@@ -0,0 +1,14 @@
+ module Torch
+   module NN
+     class Parameter < Tensor
+       def self.new(data = nil, requires_grad: true)
+         data = Tensor.new unless data
+         Tensor._make_subclass(data, requires_grad)
+       end
+
+       def grad
+         _grad if _grad_defined
+       end
+     end
+   end
+ end
data/lib/torch/nn/relu.rb
@@ -0,0 +1,13 @@
+ module Torch
+   module NN
+     class ReLU < Module
+       def initialize #(inplace: false)
+         # @inplace = inplace
+       end
+
+       def forward(input)
+         F.relu(input) #, inplace: @inplace)
+       end
+     end
+   end
+ end
data/lib/torch/nn/sequential.rb
@@ -0,0 +1,29 @@
+ module Torch
+   module NN
+     class Sequential < Module
+       def initialize(*args)
+         @modules = {}
+         # TODO support hash arg (named modules)
+         args.each_with_index do |mod, idx|
+           add_module(idx.to_s, mod)
+         end
+       end
+
+       def add_module(name, mod)
+         # TODO add checks
+         @modules[name] = mod
+       end
+
+       def forward(input)
+         @modules.values.each do |mod|
+           input = mod.call(input)
+         end
+         input
+       end
+
+       def parameters
+         @modules.flat_map { |_, mod| mod.parameters }
+       end
+     end
+   end
+ end
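
A short usage sketch combining Sequential with the ReLU and MSELoss classes above; Linear, Torch.randn, and Torch.zeros are assumed from the rest of the gem (as in the README) and are not shown in this diff.

  require "torch"

  model = Torch::NN::Sequential.new(
    Torch::NN::Linear.new(8, 16),  # assumed: Linear.new(in_features, out_features)
    Torch::NN::ReLU.new,
    Torch::NN::Linear.new(16, 1)
  )

  x = Torch.randn(32, 8)                 # assumed helper from data/lib/torch.rb
  y = model.call(x)                      # child modules run in insertion order ("0", "1", "2")
  loss = Torch::NN::MSELoss.new.call(y, Torch.zeros(32, 1))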
data/lib/torch/optim/adadelta.rb
@@ -0,0 +1,57 @@
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adadelta.py
+ module Torch
+   module Optim
+     class Adadelta < Optimizer
+       def initialize(params, lr: 1.0, rho: 0.9, eps: 1e-6, weight_decay: 0)
+         raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
+         raise ArgumentError, "Invalid rho value: #{rho}" if rho < 0 || rho > 1
+         raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
+         raise ArgumentError, "Invalid weight_decay value: #{weight_decay}" if weight_decay < 0
+
+         defaults = {lr: lr, rho: rho, eps: eps, weight_decay: weight_decay}
+         super(params, defaults)
+       end
+
+       def step(closure = nil)
+         loss = nil
+         if closure
+           loss = closure.call
+         end
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             next unless p.grad
+             grad = p.grad.data
+             if grad.sparse?
+               raise Error, "Adadelta does not support sparse gradients"
+             end
+             state = @state[p]
+
+             if state.size == 0
+               state[:step] = 0
+               state[:square_avg] = Torch.zeros_like(p.data)
+               state[:acc_delta] = Torch.zeros_like(p.data)
+             end
+
+             square_avg, acc_delta = state[:square_avg], state[:acc_delta]
+             rho, eps = group[:rho], group[:eps]
+
+             state[:step] += 1
+
+             if group[:weight_decay] != 0
+               grad = grad.add(group[:weight_decay], p.data)
+             end
+
+             square_avg.mul!(rho).addcmul!(1 - rho, grad, grad)
+             std = square_avg.add(eps).sqrt!
+             delta = acc_delta.add(eps).sqrt!.div!(std).mul!(grad)
+             p.data.add!(-group[:lr], delta)
+             acc_delta.mul!(rho).addcmul!(1 - rho, delta, delta)
+           end
+         end
+
+         loss
+       end
+     end
+   end
+ end
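
A minimal training-loop sketch around the Adadelta optimizer above. The model classes come from the hunks shown earlier; Torch.randn, Torch.zeros, and Tensor#backward are assumptions based on the rest of the gem.

  require "torch"

  model = Torch::NN::Sequential.new(Torch::NN::Linear.new(4, 1))
  optimizer = Torch::Optim::Adadelta.new(model.parameters, lr: 1.0)
  criterion = Torch::NN::MSELoss.new

  x = Torch.randn(32, 4)  # assumed helpers from data/lib/torch.rb
  y = Torch.zeros(32, 1)

  10.times do
    model.zero_grad                          # Module#zero_grad from module.rb above
    loss = criterion.call(model.call(x), y)
    loss.backward                            # assumed Tensor#backward, as in the README
    optimizer.step                           # no closure passed, so step returns nil
  end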
data/lib/torch/optim/adagrad.rb
@@ -0,0 +1,71 @@
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adagrad.py
+ module Torch
+   module Optim
+     class Adagrad < Optimizer
+       def initialize(params, lr: 1e-2, lr_decay: 0, weight_decay: 0, initial_accumulator_value: 0, eps: 1e-10)
+         raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
+         raise ArgumentError, "Invalid lr_decay value: #{lr_decay}" if lr_decay < 0
+         raise ArgumentError, "Invalid initial_accumulator_value value: #{initial_accumulator_value}" if initial_accumulator_value < 0
+         raise ArgumentError, "Invalid weight_decay value: #{weight_decay}" if weight_decay < 0
+         raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
+
+         defaults = {lr: lr, lr_decay: lr_decay, eps: eps, weight_decay: weight_decay, initial_accumulator_value: initial_accumulator_value}
+         super(params, defaults)
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             state = @state[p]
+             state[:step] = 0
+             state[:sum] = Torch.full_like(p.data, initial_accumulator_value)
+           end
+         end
+       end
+
+       def share_memory
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             state = @state[p]
+             state[:sum].share_memory!
+           end
+         end
+       end
+
+       def step(closure = nil)
+         loss = nil
+         if closure
+           loss = closure.call
+         end
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             next unless p.grad
+
+             grad = p.grad.data
+             state = @state[p]
+
+             state[:step] += 1
+
+             if group[:weight_decay] != 0
+               if p.grad.data.sparse?
+                 raise Error, "weight_decay option is not compatible with sparse gradients"
+               end
+               grad = grad.add(group[:weight_decay], p.data)
+             end
+
+             clr = group[:lr] / (1 + (state[:step] - 1) * group[:lr_decay])
+
+             if grad.sparse?
+               raise NotImplementedYet
+             else
+               state[:sum].addcmul!(1, grad, grad)
+               std = state[:sum].sqrt.add!(group[:eps])
+               p.data.addcdiv!(-clr, grad, std)
+             end
+           end
+         end
+
+         loss
+       end
+     end
+   end
+ end
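
The clr line above is Adagrad's learning-rate decay: with the default lr_decay of 0 the rate stays constant, and with a positive lr_decay it shrinks hyperbolically with the step count. A plain-Ruby illustration:

  lr = 1e-2
  lr_decay = 1e-4

  clr = ->(step) { lr / (1 + (step - 1) * lr_decay) }
  clr.call(1)      # => 0.01
  clr.call(10_001) # => 0.005, halved after 10,000 steps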
data/lib/torch/optim/adam.rb
@@ -0,0 +1,81 @@
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adam.py
+ module Torch
+   module Optim
+     class Adam < Optimizer
+       def initialize(params, lr: 1e-3, betas: [0.9, 0.999], eps: 1e-8, weight_decay: 0, amsgrad: false)
+         raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
+         raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
+         raise ArgumentError, "Invalid beta parameter at index 0: #{betas[0]}" if betas[0] < 0 || betas[0] >= 1
+         raise ArgumentError, "Invalid beta parameter at index 1: #{betas[1]}" if betas[1] < 0 || betas[1] >= 1
+
+         defaults = {lr: lr, betas: betas, eps: eps, weight_decay: weight_decay, amsgrad: amsgrad}
+         super(params, defaults)
+       end
+
+       def step(closure = nil)
+         loss = nil
+         if closure
+           loss = closure.call
+         end
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             next unless p.grad
+             grad = p.grad.data
+             if grad.sparse?
+               raise Error, "Adam does not support sparse gradients, please consider SparseAdam instead"
+             end
+             amsgrad = group[:amsgrad]
+
+             state = @state[p]
+
+             # State initialization
+             if state.size == 0
+               state[:step] = 0
+               # Exponential moving average of gradient values
+               state[:exp_avg] = Torch.zeros_like(p.data)
+               # Exponential moving average of squared gradient values
+               state[:exp_avg_sq] = Torch.zeros_like(p.data)
+               if amsgrad
+                 # Maintains max of all exp. moving avg. of sq. grad. values
+                 state[:max_exp_avg_sq] = Torch.zeros_like(p.data)
+               end
+             end
+
+             exp_avg, exp_avg_sq = state[:exp_avg], state[:exp_avg_sq]
+             if amsgrad
+               max_exp_avg_sq = state[:max_exp_avg_sq]
+             end
+             beta1, beta2 = group[:betas]
+
+             state[:step] += 1
+             bias_correction1 = 1 - beta1 ** state[:step]
+             bias_correction2 = 1 - beta2 ** state[:step]
+
+             if group[:weight_decay] != 0
+               grad.add!(group[:weight_decay], p.data)
+             end
+
+             # Decay the first and second moment running average coefficient
+             exp_avg.mul!(beta1).add!(1 - beta1, grad)
+             exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
+             if amsgrad
+               # Maintains the maximum of all 2nd moment running avg. till now
+               Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq)
+               # Use the max. for normalizing running avg. of gradient
+               denom = (max_exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
+             else
+               denom = (exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
+             end
+
+             step_size = group[:lr] / bias_correction1
+
+             p.data.addcdiv!(-step_size, exp_avg, denom)
+           end
+         end
+
+         loss
+       end
+     end
+   end
+ end
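
To make the bias correction above concrete, a small worked example with the default hyperparameters (beta1 = 0.9, beta2 = 0.999, lr = 1e-3):

  beta1, beta2, lr = 0.9, 0.999, 1e-3

  step = 1
  bias_correction1 = 1 - beta1**step  # => ~0.1
  bias_correction2 = 1 - beta2**step  # => ~0.001; its square root rescales the denominator
  step_size = lr / bias_correction1   # => ~0.01, ten times the base rate on the first step

  step = 1000
  bias_correction1 = 1 - beta1**step  # => ~1.0
  step_size = lr / bias_correction1   # => ~0.001, the correction fades as steps accumulate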
data/lib/torch/optim/adamax.rb
@@ -0,0 +1,68 @@
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adamax.py
+ module Torch
+   module Optim
+     class Adamax < Optimizer
+       def initialize(params, lr: 2e-3, betas: [0.9, 0.999], eps: 1e-8, weight_decay: 0)
+         raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
+         raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
+         raise ArgumentError, "Invalid beta parameter at index 0: #{betas[0]}" if betas[0] < 0 || betas[0] >= 1
+         raise ArgumentError, "Invalid beta parameter at index 1: #{betas[1]}" if betas[1] < 0 || betas[1] >= 1
+         raise ArgumentError, "Invalid weight_decay value: #{weight_decay}" if weight_decay < 0
+
+         defaults = {lr: lr, betas: betas, eps: eps, weight_decay: weight_decay}
+         super(params, defaults)
+       end
+
+       def step(closure = nil)
+         loss = nil
+         if closure
+           loss = closure.call
+         end
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             next unless p.grad
+             grad = p.grad.data
+             if grad.sparse?
+               raise Error, "Adamax does not support sparse gradients, please consider SparseAdam instead"
+             end
+             state = @state[p]
+
+             # State initialization
+             if state.size == 0
+               state[:step] = 0
+               state[:exp_avg] = Torch.zeros_like(p.data)
+               state[:exp_inf] = Torch.zeros_like(p.data)
+             end
+
+             exp_avg, exp_inf = state[:exp_avg], state[:exp_inf]
+             beta1, beta2 = group[:betas]
+             eps = group[:eps]
+
+             state[:step] += 1
+
+             if group[:weight_decay] != 0
+               grad = grad.add(group[:weight_decay], p.data)
+             end
+
+             # Update biased first moment estimate.
+             exp_avg.mul!(beta1).add!(1 - beta1, grad)
+             # Update the exponentially weighted infinity norm.
+             norm_buf = Torch.cat([
+               exp_inf.mul!(beta2).unsqueeze(0),
+               grad.abs.add!(eps).unsqueeze!(0)
+             ], 0)
+             Torch.max(norm_buf, 0, keepdim: false, out: [exp_inf, exp_inf.new.long])
+
+             bias_correction = 1 - beta1 ** state[:step]
+             clr = group[:lr] / bias_correction
+
+             p.data.addcdiv!(-clr, exp_avg, exp_inf)
+           end
+         end
+
+         loss
+       end
+     end
+   end
+ end
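
The Torch.cat / Torch.max pair above is an elementwise update exp_inf = max(beta2 * exp_inf, |grad| + eps), i.e. a decayed infinity norm of the gradients. In scalar form:

  beta2 = 0.999
  eps = 1e-8
  exp_inf = 0.5
  grad = -0.2

  exp_inf = [beta2 * exp_inf, grad.abs + eps].max # => 0.4995; the decayed norm wins over |grad| here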
data/lib/torch/optim/adamw.rb
@@ -0,0 +1,82 @@
+ # ported from https://github.com/pytorch/pytorch/blob/master/torch/optim/adamw.py
+ module Torch
+   module Optim
+     class AdamW < Optimizer
+       def initialize(params, lr: 1e-3, betas: [0.9, 0.999], eps: 1e-8, weight_decay: 1e-2, amsgrad: false)
+         raise ArgumentError, "Invalid learning rate: #{lr}" if lr < 0
+         raise ArgumentError, "Invalid epsilon value: #{eps}" if eps < 0
+         raise ArgumentError, "Invalid beta parameter at index 0: #{betas[0]}" if betas[0] < 0 || betas[0] >= 1
+         raise ArgumentError, "Invalid beta parameter at index 1: #{betas[1]}" if betas[1] < 0 || betas[1] >= 1
+
+         defaults = {lr: lr, betas: betas, eps: eps, weight_decay: weight_decay, amsgrad: amsgrad}
+         super(params, defaults)
+       end
+
+       def step(closure = nil)
+         loss = nil
+         if closure
+           loss = closure.call
+         end
+
+         @param_groups.each do |group|
+           group[:params].each do |p|
+             next unless p.grad
+
+             # Perform stepweight decay
+             p.data.mul!(1 - group[:lr] * group[:weight_decay])
+
+             # Perform optimization step
+             grad = p.grad.data
+             if grad.sparse?
+               raise Error, "AdamW does not support sparse gradients, please consider SparseAdam instead"
+             end
+             amsgrad = group[:amsgrad]
+
+             state = @state[p]
+
+             # State initialization
+             if state.size == 0
+               state[:step] = 0
+               # Exponential moving average of gradient values
+               state[:exp_avg] = Torch.zeros_like(p.data)
+               # Exponential moving average of squared gradient values
+               state[:exp_avg_sq] = Torch.zeros_like(p.data)
+               if amsgrad
+                 # Maintains max of all exp. moving avg. of sq. grad. values
+                 state[:max_exp_avg_sq] = Torch.zeros_like(p.data)
+               end
+             end
+
+             exp_avg, exp_avg_sq = state[:exp_avg], state[:exp_avg_sq]
+             if amsgrad
+               max_exp_avg_sq = state[:max_exp_avg_sq]
+             end
+             beta1, beta2 = group[:betas]
+
+             state[:step] += 1
+             bias_correction1 = 1 - beta1 ** state[:step]
+             bias_correction2 = 1 - beta2 ** state[:step]
+
+             # Decay the first and second moment running average coefficient
+             exp_avg.mul!(beta1).add!(1 - beta1, grad)
+             exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
+             if amsgrad
+               # Maintains the maximum of all 2nd moment running avg. till now
+               Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq)
+               # Use the max. for normalizing running avg. of gradient
+               denom = (max_exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
+             else
+               denom = (exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
+             end
+
+             step_size = group[:lr] / bias_correction1
+
+             p.data.addcdiv!(-step_size, exp_avg, denom)
+           end
+         end
+
+         loss
+       end
+     end
+   end
+ end
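
The only change from Adam above is the decoupled weight decay: p.data is shrunk directly by lr * weight_decay before the Adam update, rather than the decay being folded into the gradient. A construction sketch (model.parameters as in the earlier sketches):

  optimizer = Torch::Optim::AdamW.new(model.parameters, lr: 1e-3, weight_decay: 1e-2)
  # Each step first multiplies every weight by 1 - 1e-3 * 1e-2 = 0.99999,
  # then applies the same moment updates as Adam.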