tensor_stream-opencl 0.1.0

Files changed (72)
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/.travis.yml +5 -0
  5. data/CODE_OF_CONDUCT.md +74 -0
  6. data/Gemfile +6 -0
  7. data/Gemfile.lock +51 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +58 -0
  10. data/Rakefile +6 -0
  11. data/bin/console +14 -0
  12. data/bin/setup +8 -0
  13. data/lib/tensor_stream/opencl.rb +7 -0
  14. data/lib/tensor_stream/opencl/kernels/_bool_operand.cl +45 -0
  15. data/lib/tensor_stream/opencl/kernels/_operand.cl +45 -0
  16. data/lib/tensor_stream/opencl/kernels/abs.cl +20 -0
  17. data/lib/tensor_stream/opencl/kernels/acos.cl +8 -0
  18. data/lib/tensor_stream/opencl/kernels/add.cl +3 -0
  19. data/lib/tensor_stream/opencl/kernels/apply_adam.cl +23 -0
  20. data/lib/tensor_stream/opencl/kernels/apply_gradient.cl +9 -0
  21. data/lib/tensor_stream/opencl/kernels/apply_momentum.cl +16 -0
  22. data/lib/tensor_stream/opencl/kernels/argmax.cl +8 -0
  23. data/lib/tensor_stream/opencl/kernels/argmin.cl +8 -0
  24. data/lib/tensor_stream/opencl/kernels/asin.cl +9 -0
  25. data/lib/tensor_stream/opencl/kernels/cast.cl +10 -0
  26. data/lib/tensor_stream/opencl/kernels/ceil.cl +8 -0
  27. data/lib/tensor_stream/opencl/kernels/cond.cl.erb +6 -0
  28. data/lib/tensor_stream/opencl/kernels/cos.cl +8 -0
  29. data/lib/tensor_stream/opencl/kernels/div.cl.erb +3 -0
  30. data/lib/tensor_stream/opencl/kernels/exp.cl +8 -0
  31. data/lib/tensor_stream/opencl/kernels/floor.cl +8 -0
  32. data/lib/tensor_stream/opencl/kernels/floor_div.cl +48 -0
  33. data/lib/tensor_stream/opencl/kernels/floor_mod.cl +3 -0
  34. data/lib/tensor_stream/opencl/kernels/gemm.cl +32 -0
  35. data/lib/tensor_stream/opencl/kernels/log.cl +8 -0
  36. data/lib/tensor_stream/opencl/kernels/log1p.cl +8 -0
  37. data/lib/tensor_stream/opencl/kernels/log_softmax.cl +26 -0
  38. data/lib/tensor_stream/opencl/kernels/max.cl +46 -0
  39. data/lib/tensor_stream/opencl/kernels/min.cl +46 -0
  40. data/lib/tensor_stream/opencl/kernels/mod.cl +3 -0
  41. data/lib/tensor_stream/opencl/kernels/mul.cl +3 -0
  42. data/lib/tensor_stream/opencl/kernels/negate.cl +8 -0
  43. data/lib/tensor_stream/opencl/kernels/pack.cl +24 -0
  44. data/lib/tensor_stream/opencl/kernels/pow.cl +46 -0
  45. data/lib/tensor_stream/opencl/kernels/real_div.cl +3 -0
  46. data/lib/tensor_stream/opencl/kernels/reciprocal.cl +8 -0
  47. data/lib/tensor_stream/opencl/kernels/round.cl +8 -0
  48. data/lib/tensor_stream/opencl/kernels/sigmoid.cl +9 -0
  49. data/lib/tensor_stream/opencl/kernels/sigmoid_grad.cl +55 -0
  50. data/lib/tensor_stream/opencl/kernels/sign.cl +21 -0
  51. data/lib/tensor_stream/opencl/kernels/sin.cl +9 -0
  52. data/lib/tensor_stream/opencl/kernels/softmax.cl +26 -0
  53. data/lib/tensor_stream/opencl/kernels/softmax_cross.cl +32 -0
  54. data/lib/tensor_stream/opencl/kernels/softmax_cross_grad.cl +28 -0
  55. data/lib/tensor_stream/opencl/kernels/softmax_grad.cl +46 -0
  56. data/lib/tensor_stream/opencl/kernels/sqrt.cl +9 -0
  57. data/lib/tensor_stream/opencl/kernels/square.cl +9 -0
  58. data/lib/tensor_stream/opencl/kernels/squared_difference.cl +53 -0
  59. data/lib/tensor_stream/opencl/kernels/sub.cl +3 -0
  60. data/lib/tensor_stream/opencl/kernels/tan.cl +8 -0
  61. data/lib/tensor_stream/opencl/kernels/tanh.cl +8 -0
  62. data/lib/tensor_stream/opencl/kernels/tanh_grad.cl +7 -0
  63. data/lib/tensor_stream/opencl/kernels/where.cl +8 -0
  64. data/lib/tensor_stream/opencl/math_ops.rb +133 -0
  65. data/lib/tensor_stream/opencl/nn_ops.rb +191 -0
  66. data/lib/tensor_stream/opencl/opencl_buffer.rb +35 -0
  67. data/lib/tensor_stream/opencl/opencl_device.rb +5 -0
  68. data/lib/tensor_stream/opencl/opencl_evaluator.rb +933 -0
  69. data/lib/tensor_stream/opencl/opencl_template_helper.rb +99 -0
  70. data/lib/tensor_stream/opencl/version.rb +5 -0
  71. data/tensor_stream-opencl.gemspec +40 -0
  72. metadata +185 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 0d1536e4ed46b18be21e6c0a12a6c06a854d6695
+   data.tar.gz: 458e8fcf33d992c3e0ee3660bf632ef5bab49b8e
+ SHA512:
+   metadata.gz: 9841051b9a4ef0809eb243158aeb1bfa19e8efeb81f30e99795a019e86b08ff65b02b0bd0c64e4d31350bc496a6371298e945d77f52456439cfd419b02fef2ce
+   data.tar.gz: 29d8efc7a82aac8ecd959a46d04e02c96efeca311ef9340fd74be6bbe1b89a62d8789a2cc867c1d62c2860dffcf03c203bd825ebe159fc0be4db3ec457eb2ceb
data/.gitignore ADDED
@@ -0,0 +1,11 @@
+ /.bundle/
+ /.yardoc
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+
+ # rspec failure tracking
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
+ --format documentation
+ --color
+ --require spec_helper
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
+ sudo: false
+ language: ruby
+ rvm:
+   - 2.5.1
+ before_install: gem install bundler -v 1.16.2
data/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,74 @@
+ # Contributor Covenant Code of Conduct
+
+ ## Our Pledge
+
+ In the interest of fostering an open and welcoming environment, we as
+ contributors and maintainers pledge to making participation in our project and
+ our community a harassment-free experience for everyone, regardless of age, body
+ size, disability, ethnicity, gender identity and expression, level of experience,
+ nationality, personal appearance, race, religion, or sexual identity and
+ orientation.
+
+ ## Our Standards
+
+ Examples of behavior that contributes to creating a positive environment
+ include:
+
+ * Using welcoming and inclusive language
+ * Being respectful of differing viewpoints and experiences
+ * Gracefully accepting constructive criticism
+ * Focusing on what is best for the community
+ * Showing empathy towards other community members
+
+ Examples of unacceptable behavior by participants include:
+
+ * The use of sexualized language or imagery and unwelcome sexual attention or
+   advances
+ * Trolling, insulting/derogatory comments, and personal or political attacks
+ * Public or private harassment
+ * Publishing others' private information, such as a physical or electronic
+   address, without explicit permission
+ * Other conduct which could reasonably be considered inappropriate in a
+   professional setting
+
+ ## Our Responsibilities
+
+ Project maintainers are responsible for clarifying the standards of acceptable
+ behavior and are expected to take appropriate and fair corrective action in
+ response to any instances of unacceptable behavior.
+
+ Project maintainers have the right and responsibility to remove, edit, or
+ reject comments, commits, code, wiki edits, issues, and other contributions
+ that are not aligned to this Code of Conduct, or to ban temporarily or
+ permanently any contributor for other behaviors that they deem inappropriate,
+ threatening, offensive, or harmful.
+
+ ## Scope
+
+ This Code of Conduct applies both within project spaces and in public spaces
+ when an individual is representing the project or its community. Examples of
+ representing a project or community include using an official project e-mail
+ address, posting via an official social media account, or acting as an appointed
+ representative at an online or offline event. Representation of a project may be
+ further defined and clarified by project maintainers.
+
+ ## Enforcement
+
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
+ reported by contacting the project team at joseph.dayo@gmail.com. All
+ complaints will be reviewed and investigated and will result in a response that
+ is deemed necessary and appropriate to the circumstances. The project team is
+ obligated to maintain confidentiality with regard to the reporter of an incident.
+ Further details of specific enforcement policies may be posted separately.
+
+ Project maintainers who do not follow or enforce the Code of Conduct in good
+ faith may face temporary or permanent repercussions as determined by other
+ members of the project's leadership.
+
+ ## Attribution
+
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+ available at [http://contributor-covenant.org/version/1/4][version]
+
+ [homepage]: http://contributor-covenant.org
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
+ source "https://rubygems.org"
+
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
+
+ # Specify your gem's dependencies in tensor_stream-opencl.gemspec
+ gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,51 @@
+ PATH
+   remote: .
+   specs:
+     tensor_stream-opencl (0.1.0)
+       opencl_ruby_ffi
+       tensor_stream
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     concurrent-ruby (1.0.5)
+     deep_merge (1.2.1)
+     diff-lcs (1.3)
+     ffi (1.9.25)
+     narray (0.6.1.2)
+     narray_ffi (1.4.3)
+       ffi (~> 1.9, >= 1.9.3)
+       narray (~> 0.6, >= 0.6.0.8)
+     opencl_ruby_ffi (1.3.4)
+       ffi (~> 1.9, >= 1.9.3)
+       narray (~> 0.6, >= 0.6.0.8)
+       narray_ffi (~> 1.0, >= 1.0.0)
+     rake (10.5.0)
+     rspec (3.8.0)
+       rspec-core (~> 3.8.0)
+       rspec-expectations (~> 3.8.0)
+       rspec-mocks (~> 3.8.0)
+     rspec-core (3.8.0)
+       rspec-support (~> 3.8.0)
+     rspec-expectations (3.8.1)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.8.0)
+     rspec-mocks (3.8.0)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.8.0)
+     rspec-support (3.8.0)
+     tensor_stream (0.8.1)
+       concurrent-ruby
+       deep_merge
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   bundler (~> 1.16)
+   rake (~> 10.0)
+   rspec (~> 3.0)
+   tensor_stream-opencl!
+
+ BUNDLED WITH
+    1.16.2
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2018 Joseph Dayo
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,58 @@
+ # TensorStream::Opencl
+
+ This gem provides an OpenCL backend for TensorStream (https://github.com/jedld/tensor_stream). OpenCL is an open standard
+ that allows running compute applications on heterogeneous platforms such as CPUs and GPUs.
+
+ ## Installation
+
+ Make sure OpenCL device drivers are installed on your system. You may refer to the following links:
+
+ ### Nvidia
+
+ https://developer.nvidia.com/opencl
+
+ ### AMD
+
+ https://support.amd.com/en-us/kb-articles/Pages/OpenCL2-Driver.aspx
+
+
+ ### Intel
+
+ https://software.intel.com/en-us/articles/opencl-drivers
+
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'tensor_stream-opencl'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install tensor_stream-opencl
+
+ ## Usage
+
+ Simply requiring this gem allows tensor_stream to automatically select OpenCL devices for use in your computations (a short usage sketch follows this file).
+
+ ## Development
+
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/jedld/tensor_stream-opencl. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
+
+ ## License
+
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
+
+ ## Code of Conduct
+
+ Everyone interacting in the TensorStream::Opencl project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/jedld/tensor_stream-opencl/blob/master/CODE_OF_CONDUCT.md).
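The Usage section above is terse, so here is a minimal usage sketch (an editor's illustration, not one of this release's files). It assumes the graph-and-session API documented in the main tensor_stream gem, from which `TensorStream.constant` and `TensorStream.session` are taken:

```ruby
require "tensor_stream"
require "tensor_stream/opencl" # requiring the gem registers the OpenCL evaluator

ts = TensorStream

# Build a small graph; with this gem loaded, supported ops are dispatched
# to an available OpenCL device automatically when the session runs.
a = ts.constant([[1.0, 2.0], [3.0, 4.0]])
b = ts.constant([[5.0, 6.0], [7.0, 8.0]])
sum = a + b

sess = ts.session
puts sess.run(sum).inspect
```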
data/Rakefile ADDED
@@ -0,0 +1,6 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "tensor_stream/opencl"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/lib/tensor_stream/opencl.rb ADDED
@@ -0,0 +1,7 @@
+ require "tensor_stream/opencl/version"
+ require "tensor_stream/opencl/opencl_evaluator"
+
+ module TensorStream
+   module Opencl
+   end
+ end
data/lib/tensor_stream/opencl/kernels/_bool_operand.cl ADDED
@@ -0,0 +1,45 @@
+ // same-dimension element-wise op, boolean (0/1) result
+ __kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol] ? 1 : 0;
+ }
+
+ // tensor-scalar op; switch_op swaps the operand order
+ __kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     if (switch_op == 0) {
+         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0] ? 1 : 0;
+     } else {
+         C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+     }
+ }
+
+ // broadcasting op: B is M2 x N2, out-of-range indices wrap via modulo
+ __kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= a_dtype %> *A, __global <%= b_dtype %> *B, __global <%= result_t %> *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     int b_m_index = globalRow;
+     int b_n_index = globalCol;
+
+     if (b_m_index >= M2) {
+         b_m_index = b_m_index % M2;
+     }
+
+     if (b_n_index >= N2) {
+         b_n_index = b_n_index % N2;
+     }
+
+     if (switch_op == 0) {
+         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index] ? 1 : 0;
+     } else {
+         C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol] ? 1 : 0;
+     }
+ }
data/lib/tensor_stream/opencl/kernels/_operand.cl ADDED
@@ -0,0 +1,45 @@
+ // same-dimension element-wise op
+ __kernel void <%= fname%>_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[globalRow * N + globalCol];
+ }
+
+ // tensor-scalar op; switch_op swaps the operand order
+ __kernel void <%=fname%>_c_<%= dtype %>(const int M, const int N, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     if (switch_op == 0) {
+         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[0];
+     } else {
+         C[globalRow * N + globalCol] = B[0] <%= op %> A[globalRow * N + globalCol];
+     }
+ }
+
+ // broadcasting op: B is M2 x N2, out-of-range indices wrap via modulo
+ __kernel void <%= fname%>_b_<%= dtype %>(const int M, const int N, const int M2, const int N2, const int switch_op, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *B, __global <%= result_t %> *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     int b_m_index = globalRow;
+     int b_n_index = globalCol;
+
+     if (b_m_index >= M2) {
+         b_m_index = b_m_index % M2;
+     }
+
+     if (b_n_index >= N2) {
+         b_n_index = b_n_index % N2;
+     }
+
+     if (switch_op == 0) {
+         C[globalRow * N + globalCol] = A[globalRow * N + globalCol] <%= op %> B[b_m_index * N2 + b_n_index];
+     } else {
+         C[globalRow * N + globalCol] = B[b_m_index * N2 + b_n_index] <%= op %> A[globalRow * N + globalCol];
+     }
+ }
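Both operand templates share the same broadcasting scheme: the `_b_` kernels read the second operand from a smaller M2 x N2 buffer and wrap out-of-range row and column indices with modulo, while the `_bool_operand` variant differs only in emitting `1`/`0` for comparison operators. A plain-Ruby mirror of that index arithmetic (a hypothetical helper, purely for illustration):

```ruby
# Mirrors the `_b_` kernels' index math: an element (row, col) of the MxN
# output reads operand B at the wrapped position inside its M2xN2 buffer.
def broadcast_index(row, col, m2, n2)
  [row >= m2 ? row % m2 : row,
   col >= n2 ? col % n2 : col]
end

broadcast_index(3, 1, 1, 2) # => [0, 1] (a 1x2 row vector reused on every row)
```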
data/lib/tensor_stream/opencl/kernels/abs.cl ADDED
@@ -0,0 +1,20 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % if TensorStream::Ops::FLOATING_POINT_TYPES.include?(dtype)
+ __kernel void abs_<%= dtype%>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = fabs(A[globalRow * N + globalCol]);
+ }
+ % else
+ % %w[int int32].each do |dt|
+ __kernel void abs_<%= dt %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = fabs((float)A[globalRow * N + globalCol]);
+ }
+ % end
+ % end
data/lib/tensor_stream/opencl/kernels/acos.cl ADDED
@@ -0,0 +1,8 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ __kernel void acos_<%= dtype %>(const int M, const int N, __global const <%= c_dtype %> *A, __global <%= c_dtype %> *C) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+
+     C[globalRow * N + globalCol] = acos(A[globalRow * N + globalCol]);
+ }
data/lib/tensor_stream/opencl/kernels/add.cl ADDED
@@ -0,0 +1,3 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ % op = operator_to_c('add')
+ <%= render 'operand.cl', c_dtype: c_dtype, op: op, fname: 'add', dtype: "#{a}_#{b}", result_t: c_dtype %>
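add.cl shows the pattern used by the binary-op kernels: a three-line ERB wrapper that resolves the C operator and renders the shared operand template. As a rough illustration of that mechanism, here is a toy ERB rendering with a stand-in helper (hypothetical; the gem's real helpers live in opencl_template_helper.rb, and its templates also use the `%`-prefixed line syntax, which requires ERB's "%" trim mode):

```ruby
require "erb"

# Toy stand-in for the gem's operator_to_c helper (hypothetical mapping).
def operator_to_c(op)
  { 'add' => '+', 'sub' => '-', 'mul' => '*' }.fetch(op)
end

op = operator_to_c('add')
template = ERB.new("C[i] = A[i] <%= op %> B[i];")
puts template.result(binding) # => C[i] = A[i] + B[i];
```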
data/lib/tensor_stream/opencl/kernels/apply_adam.cl ADDED
@@ -0,0 +1,23 @@
+ % c_dtype = dtype_to_c_type(dtype)
+ // per-element Adam optimizer update
+ __kernel void apply_adam_<%= dtype %>(const int M, const int N,
+                           __global const <%= c_dtype %> *grad,
+                           __global const <%= c_dtype %> *learning_rate,
+                           __global const <%= c_dtype %> *beta1_power,
+                           __global const <%= c_dtype %> *beta2_power,
+                           __global const <%= c_dtype %> *beta1,
+                           __global const <%= c_dtype %> *beta2,
+                           __global const <%= c_dtype %> *epsilon,
+                           __global <%= c_dtype %> *momentum,
+                           __global <%= c_dtype %> *output, __global <%= c_dtype %> *v) {
+     // Get the index of the current element to be processed
+     const int globalRow = get_global_id(0); // Row ID of C (0..M)
+     const int globalCol = get_global_id(1); // Col ID of C (0..N)
+     const int index = globalRow * N + globalCol;
+
+     <%= c_dtype %> alpha = learning_rate[0] * sqrt(1.0 - beta2_power[0]) / (1.0 - beta1_power[0]);
+
+     momentum[index] += (grad[index] - momentum[index]) * (1.0 - beta1[0]);
+     v[index] += (grad[index] * grad[index] - v[index]) * (1.0 - beta2[0]);
+     output[index] -= (momentum[index] * alpha) / (sqrt(v[index]) + epsilon[0]);
+ }
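For reference, apply_adam.cl folds Adam's bias correction into a single step size, alpha = lr * sqrt(1 - beta2^t) / (1 - beta1^t), then updates the first moment, second moment, and parameter for each element. A plain-Ruby mirror of the kernel's arithmetic (illustrative; the names follow the kernel arguments, not any gem API):

```ruby
# One Adam step on a single scalar weight, mirroring the kernel body.
def adam_step(param, m, v, grad, lr, beta1, beta2, beta1_power, beta2_power, eps)
  alpha = lr * Math.sqrt(1.0 - beta2_power) / (1.0 - beta1_power)
  m += (grad - m) * (1.0 - beta1)          # first-moment (momentum) estimate
  v += (grad * grad - v) * (1.0 - beta2)   # second-moment estimate
  param -= m * alpha / (Math.sqrt(v) + eps)
  [param, m, v]
end

p adam_step(0.5, 0.0, 0.0, 0.2, 0.001, 0.9, 0.999, 0.9, 0.999, 1e-8)
# => [0.499..., 0.02, 4.0e-05]
```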