aidge-export-cpp 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. aidge_export_cpp/__init__.py +16 -0
  2. aidge_export_cpp/_version.py +4 -0
  3. aidge_export_cpp/export.py +131 -0
  4. aidge_export_cpp/export_registry.py +10 -0
  5. aidge_export_cpp/kernels/activation.hpp +77 -0
  6. aidge_export_cpp/kernels/batchnorm.hpp +41 -0
  7. aidge_export_cpp/kernels/convolution.hpp +119 -0
  8. aidge_export_cpp/kernels/elemwise.hpp +171 -0
  9. aidge_export_cpp/kernels/fullyconnected.hpp +72 -0
  10. aidge_export_cpp/kernels/leakyrelu.hpp +25 -0
  11. aidge_export_cpp/kernels/macs.hpp +19 -0
  12. aidge_export_cpp/kernels/matmul.hpp +33 -0
  13. aidge_export_cpp/kernels/pooling.hpp +126 -0
  14. aidge_export_cpp/kernels/rescaling.hpp +16 -0
  15. aidge_export_cpp/operators.py +282 -0
  16. aidge_export_cpp/static/Makefile +30 -0
  17. aidge_export_cpp/static/include/network/typedefs.hpp +32 -0
  18. aidge_export_cpp/static/include/network/utils.hpp +149 -0
  19. aidge_export_cpp/templates/configuration/_def_io.jinja +14 -0
  20. aidge_export_cpp/templates/configuration/_meminfo.jinja +11 -0
  21. aidge_export_cpp/templates/configuration/activation_config.jinja +14 -0
  22. aidge_export_cpp/templates/configuration/batchnorm_config.jinja +11 -0
  23. aidge_export_cpp/templates/configuration/convolution_config.jinja +25 -0
  24. aidge_export_cpp/templates/configuration/elemwise_config.jinja +13 -0
  25. aidge_export_cpp/templates/configuration/fullyconnected_config.jinja +16 -0
  26. aidge_export_cpp/templates/configuration/leakyrelu_config.jinja +11 -0
  27. aidge_export_cpp/templates/configuration/matmul_config.jinja +15 -0
  28. aidge_export_cpp/templates/configuration/pooling_config.jinja +17 -0
  29. aidge_export_cpp/templates/data/inputs.jinja +58 -0
  30. aidge_export_cpp/templates/data/parameters.jinja +21 -0
  31. aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja +6 -0
  32. aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja +20 -0
  33. aidge_export_cpp/templates/kernel_forward/activation_forward.jinja +7 -0
  34. aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja +9 -0
  35. aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja +20 -0
  36. aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja +8 -0
  37. aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja +12 -0
  38. aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja +6 -0
  39. aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja +5 -0
  40. aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja +19 -0
  41. aidge_export_cpp/utils/__init__.py +27 -0
  42. aidge_export_cpp/utils/converter.py +18 -0
  43. aidge_export_cpp/utils/generation.py +51 -0
  44. aidge_export_cpp-0.2.0.dist-info/LICENSE +277 -0
  45. aidge_export_cpp-0.2.0.dist-info/METADATA +319 -0
  46. aidge_export_cpp-0.2.0.dist-info/RECORD +48 -0
  47. aidge_export_cpp-0.2.0.dist-info/WHEEL +5 -0
  48. aidge_export_cpp-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,16 @@
1
+ r"""
2
+ Aidge Export for CPP standalone projects
3
+
4
+ """
5
+ from .export_registry import ExportLibCpp
6
+
7
+ from .operators import *
8
+ from collections import defaultdict
9
+ import aidge_core
10
+
11
+ from aidge_export_cpp.utils import ROOT
12
+
13
+ from ._version import *
14
+
15
+ from .export import *
16
+
@@ -0,0 +1,4 @@
1
# file generated by setuptools_scm
# don't change, don't track in version control
# The version is exposed both as a PEP 440 string and as a numeric tuple,
# mirroring setuptools_scm's standard _version.py layout, so callers can
# compare versions without parsing the string.
__version__ = version = '0.2.0'
__version_tuple__ = version_tuple = (0, 2, 0)
@@ -0,0 +1,131 @@
1
+ import re
2
+ import os
3
+ import numpy as np
4
+
5
+ import aidge_core
6
+
7
+ from aidge_core.export_utils.code_generation import *
8
+ from aidge_core.mem_info import compute_default_mem_info
9
+
10
+ from aidge_export_cpp.utils import ROOT
11
+ from aidge_export_cpp.utils.converter import numpy_dtype2ctype
12
+ from aidge_export_cpp import ExportLibCpp
13
+ from aidge_export_cpp.utils.generation import *
14
+ # from aidge_export_cpp.memory import *
15
+
16
+
17
def generate_input_file(export_folder: str,
                        array_name: str,
                        array: np.ndarray):
    """Generate a C header embedding *array* as a static input tensor.

    Renders the ``templates/data/inputs.jinja`` template into
    ``<export_folder>/<array_name>.h``, passing the array's shape, its
    values and the C type matching its numpy dtype.

    :param export_folder: destination directory (created if missing)
    :param array_name: base name of the generated header / C symbol
    :param array: numpy array whose values are embedded in the header
    """
    # exist_ok avoids the check-then-create race of the previous
    # os.path.exists() + os.makedirs() sequence.
    os.makedirs(export_folder, exist_ok=True)

    generate_file(
        file_path=f"{export_folder}/{array_name}.h",
        template_path=str(ROOT / "templates" / "data" / "inputs.jinja"),
        dims=array.shape,
        data_t=numpy_dtype2ctype(array.dtype),
        name=array_name,
        values=array.tolist()
    )
33
+
34
+
35
def export(export_folder_name, graphview, scheduler, mem_wrapping=False):
    """Export the scheduled graph as a standalone C++ project.

    Delegates entirely to ``aidge_core.export_utils.scheduler_export`` with
    this package's :class:`ExportLibCpp` registry and the default memory
    layout computed by ``compute_default_mem_info``.

    NOTE(review): ``graphview`` and ``mem_wrapping`` are currently unused —
    they are kept for backward compatibility with the previous export
    implementation (see the commented-out code below); confirm before
    removing them from the signature.
    """
    aidge_core.export_utils.scheduler_export(
        scheduler,
        export_folder_name,
        ExportLibCpp,
        memory_manager=compute_default_mem_info
    )
42
+
43
+ # export_folder = Path().absolute() / export_folder_name
44
+
45
+ # os.makedirs(str(export_folder), exist_ok=True)
46
+
47
+ # dnn_folder = export_folder / "dnn"
48
+ # os.makedirs(str(dnn_folder), exist_ok=True)
49
+
50
+ # list_actions = []
51
+ # list_configs = []
52
+ # peak_mem, mem_info = compute_default_mem_info(scheduler)
53
+ # list_forward_nodes = scheduler.get_static_scheduling()
54
+
55
+ # for node in list_forward_nodes:
56
+ # if ExportLibCpp.exportable(node):
57
+ # op = ExportLibCpp.get_export_node(node)(node, mem_info[node])
58
+ # # For configuration files
59
+ # list_configs = op.export(dnn_folder, list_configs)
60
+
61
+ # # For forward file
62
+ # list_actions = op.forward(list_actions)
63
+ # else:
64
+ # raise RuntimeError(f"Operator not supported: {node.type()} !")
65
+
66
+ # # Memory management
67
+ # # stats_folder = export_folder / "statistics"
68
+ # # os.makedirs(str(stats_folder), exist_ok=True)
69
+ # # mem_size, mem_info = generate_optimized_memory_info(stats_folder, scheduler, mem_wrapping)
70
+ # # peak_mem, mem_info = compute_default_mem_info(scheduler)
71
+
72
+ # # Generate the memory file
73
+ # # generate_file(
74
+ # # str(dnn_folder / "memory" / "mem_info.h"),
75
+ # # str(ROOT / "templates" / "memory" / "mem_info.jinja"),
76
+ # # mem_size = mem_size,
77
+ # # mem_info_legends = MEMORY_INFO_TEMPLATE,
78
+ # # mem_info = mem_info
79
+ # # )
80
+ # # list_configs.append("memory/mem_info.h")
81
+
82
+ # # Get entry nodes
83
+ # # Store the datatype & name
84
+ # list_inputs_name = []
85
+ # for node in graphview.get_input_nodes():
86
+ # for idx, node_input_tuple in enumerate(node.inputs()):
87
+ # node_input, _ = node_input_tuple
88
+ # if node_input is None:
89
+ # export_type = aidge2c(node.get_operator().get_output(0).dtype())
90
+ # list_inputs_name.append((export_type, f"{node.name()}_input_{idx}"))
91
+ # elif node_input not in graphview.get_nodes():
92
+ # export_type = aidge2c(node_input.get_operator().get_output(0).dtype())
93
+ # list_inputs_name.append((export_type, node_input.name()))
94
+
95
+
96
+ # # Get output nodes
97
+ # # Store the datatype & name, like entry nodes
98
+ # list_outputs_name = []
99
+ # for node in graphview.get_nodes():
100
+ # if len(node.get_children()) == 0:
101
+ # export_type = aidge2c(node.get_operator().get_output(0).dtype())
102
+ # list_outputs_name.append((export_type, f"{node.name()}_output_0"))
103
+
104
+ # # Generate forward file
105
+ # # TODO: for now the mem type is bound for all intermediate results, should change.
106
+ # # Note that we may have all inputs constants, hence select output type
107
+ # assert len(list_outputs_name) >= 1, f"TODO: requires some output to determine mem type"
108
+ # mem_ctype = list_outputs_name[0][0]
109
+ # generate_file(
110
+ # str(dnn_folder / "src" / "forward.cpp"),
111
+ # str(ROOT / "templates" / "network" / "network_forward.jinja"),
112
+ # headers=set(list_configs),
113
+ # actions=list_actions,
114
+ # inputs= list_inputs_name,
115
+ # outputs=list_outputs_name,
116
+ # mem_ctype=mem_ctype,
117
+ # peak_mem=peak_mem
118
+ # )
119
+
120
+ # # Generate dnn API
121
+ # generate_file(
122
+ # str(dnn_folder / "include" / "dnn.hpp"),
123
+ # str(ROOT / "templates" / "network" / "dnn_header.jinja"),
124
+ # libraries=[],
125
+ # functions=get_functions_from_c_file(str(dnn_folder / "src" / "forward.cpp")),
126
+ # )
127
+
128
+ # # Copy all static files in the export
129
+ # shutil.copy(str(ROOT / "static" / "main.cpp"), str(export_folder))
130
+ # shutil.copy(str(ROOT / "static" / "Makefile"), str(export_folder))
131
+ # shutil.copytree(str(ROOT / "static" / "include"), str(dnn_folder / "include"), dirs_exist_ok=True)
@@ -0,0 +1,10 @@
1
+ from aidge_core.export_utils import ExportLib
2
+ from aidge_export_cpp.utils import ROOT
3
+
4
class ExportLibCpp(ExportLib):
    """Aidge export registry for the standalone C++ backend.

    Operator implementations register themselves against this library
    (see ``operators.py``); ``static_files`` are copied verbatim into
    the generated export tree.
    """
    # Name under which this export target is registered in aidge_core.
    _name="export_cpp"
    # Mapping: absolute source path -> destination directory relative to
    # the export root ("" means the export root itself).
    static_files={
        str(ROOT / "static" / "Makefile"): "",
        str(ROOT / "static" / "include" / "network" / "typedefs.hpp"): "dnn/include/network",
        str(ROOT / "static" / "include" / "network" / "utils.hpp"): "dnn/include/network",
    }
@@ -0,0 +1,77 @@
1
+ #ifndef __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
2
+ #define __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
3
+
4
+ #include <type_traits>
5
+ #include "network/typedefs.hpp"
6
+ #include "network/utils.hpp"
7
+ #include "kernels/rescaling.hpp"
8
+
9
// Saturation is a no-op for floating-point accumulators: the value is
// returned unchanged and the saturation bit-width parameter is ignored.
template<typename Output_T, typename T,
         typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, int32_t /*sat*/)
{
    return value;
}
16
+
17
// Integer saturation: clamp `value` to the range representable on `sat`
// bits — [0, 2^sat - 1] when Output_T is unsigned, otherwise
// [-2^(sat-1), 2^(sat-1) - 1].
// NOTE(review): `clamp` is expected to come from "network/utils.hpp"
// (not visible here); confirm the overload resolution against that header.
template<typename Output_T, typename T,
         typename std::enable_if<!std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, uint32_t sat)
{
    if (std::is_unsigned<Output_T>::value) {
        return clamp(value, T(0), (T(1) << sat) - 1);
    } else {
        return clamp(value, -(T(1) << (sat - 1)), (T(1) << (sat - 1)) - 1);
    }
}
28
+
29
+ template<typename Output_T,
30
+ typename Sum_T,
31
+ typename Rescaling_T>
32
+ __attribute__((always_inline)) inline
33
+ Output_T activation_forward_value (Sum_T weightedSum,
34
+ int output,
35
+ ActivationFunction_T func,
36
+ const Rescaling_T& __restrict rescaling)
37
+ {
38
+ switch(func) {
39
+ case Linear:
40
+ case Saturation: {
41
+ break;
42
+ }
43
+ case Rectifier: {
44
+ if(weightedSum <= 0) weightedSum = 0;
45
+ break;
46
+ }
47
+ default:
48
+ // Unsupported activation function
49
+ break;
50
+ }
51
+
52
+ // Value fixed here for now but it should be generated by
53
+ // the export module or determined by the type of Output_T
54
+ // For now only works for int8_t and uint8_t
55
+ const uint32_t NB_BITS = 8;
56
+ return saturate<Output_T>(rescaling(weightedSum, output), NB_BITS);
57
+ }
58
+
59
+
60
+ template<int NB_DATA,
61
+ ActivationFunction_T ACTIVATION,
62
+ typename Input_T, typename Output_T, typename Rescaling_T>
63
+ __attribute__((always_inline)) inline
64
+ void activation_forward (
65
+ const Input_T* __restrict inputs,
66
+ Output_T* __restrict outputs,
67
+ const Rescaling_T& __restrict rescaling)
68
+ {
69
+ for (int i = 0; i < NB_DATA; ++i)
70
+ {
71
+ outputs[i] = activation_forward_value<Output_T>(inputs[i], i, ACTIVATION, rescaling);
72
+ }
73
+
74
+ }
75
+
76
+
77
+ #endif // __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
@@ -0,0 +1,41 @@
1
+ #ifndef __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
2
+ #define __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
3
+
4
+ #include "network/typedefs.hpp"
5
+ #include "kernels/rescaling.hpp"
6
+ #include <math.h>
7
+
8
+ // WARNING: this kernel only works for 32-bits floating point values
9
+
10
+ template<int NB_OUTPUTS,
11
+ int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
12
+ ActivationFunction_T ACTIVATION,
13
+ typename Input_T, typename Output_T,
14
+ typename Param_T>
15
+ __attribute__((always_inline)) inline
16
+ void batchnorm_forward (
17
+ const Input_T* __restrict inputs,
18
+ Output_T* __restrict outputs,
19
+ const Param_T* __restrict biases,
20
+ const Param_T* __restrict variances,
21
+ const Param_T* __restrict means,
22
+ const Param_T* __restrict scales,
23
+ const double epsilon)
24
+ {
25
+ for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
26
+ const Output_T var = sqrt(variances[output] + epsilon);
27
+
28
+ for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
29
+ for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
30
+ const int outputOffset = OUTPUTS_HEIGHT * oy + ox;
31
+
32
+ const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var;
33
+ const Output_T sAs = scales[output] * normalized + biases[output];
34
+ outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling);
35
+ }
36
+ }
37
+ }
38
+ }
39
+
40
+
41
+ #endif // __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
@@ -0,0 +1,119 @@
1
+ #ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
2
+ #define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
3
+
4
+ #include "network/typedefs.hpp"
5
+ #include "kernels/rescaling.hpp"
6
+ #include "network/utils.hpp"
7
+ #include "kernels/macs.hpp"
8
+ #include "kernels/activation.hpp"
9
+
10
+
11
// 2D convolution over an NHWC input producing an NHWC output
// (offset = NB_OUTPUTS * (ox + OUTPUTS_WIDTH * oy) + output), with
// compile-time padding, stride and dilation, a per-output bias, and a fused
// activation + rescaling stage. Weights are indexed as
// (output, sy, sx, channel): wOffset = ((output*KH + sy)*KW)*NB_CHANNELS.
// `max`, `clamp` and `macsOnRange` come from network/utils.hpp and
// kernels/macs.hpp.
template<int NB_CHANNELS,
         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
         int NB_OUTPUTS,
         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
         int PADDING_Y, int PADDING_X,
         int STRIDE_Y, int STRIDE_X,
         int DILATION_Y, int DILATION_X,
         int KERNEL_HEIGHT, int KERNEL_WIDTH,
         ActivationFunction_T ACTIVATION,
         typename Input_T, typename Output_T,
         typename Weight_T, typename Bias_T,
         typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
    const Input_T* __restrict inputs,
    Output_T* __restrict outputs,
    const Weight_T* __restrict weights,
    const Bias_T* __restrict biases,
    const Rescaling_T& __restrict rescaling)
{
    // Effective kernel extent once dilation is applied.
    constexpr int DILATED_KERNEL_HEIGHT
            = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);

    constexpr int DILATED_KERNEL_WIDTH
            = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);

    // Output size the convolution would have with zero padding; comparing
    // against it lets the padded-border checks be skipped entirely when the
    // template parameters imply no padding is in effect.
    constexpr int OUTPUTS_HEIGHT_NOPAD
        = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
    constexpr int OUTPUTS_WIDTH_NOPAD
        = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;

    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
        // Vertical range [syMin, syMax) of kernel taps that fall inside the
        // (padded) input for this output row.
        const int syMin = (PADDING_Y == 0) ? 0
            : max(PADDING_Y - (oy * STRIDE_Y), 0);
        const int syMax = (PADDING_Y == 0
                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
                    0, DILATED_KERNEL_HEIGHT);
        // Top-left input coordinate for this output row (may be negative
        // inside the padded border; the tap-range checks exclude it).
        const int iy = (oy * STRIDE_Y) - PADDING_Y;

#pragma omp parallel for collapse(2)
        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
            for (int output = 0; output < NB_OUTPUTS; ++output) {
                // moved to inner loop for collapsing -->
                const int sxMin = (PADDING_X == 0) ? 0
                    : max(PADDING_X - (ox * STRIDE_X), 0);
                const int sxMax = (PADDING_X == 0
                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                            ? DILATED_KERNEL_WIDTH
                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
                            0, DILATED_KERNEL_WIDTH);
                const int ix = (ox * STRIDE_X) - PADDING_X;

                const int oPos = (ox + OUTPUTS_WIDTH * oy);
                int oOffset = NB_OUTPUTS * oPos;

                // <--

                Bias_T weightedSum = biases[output];

                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
                    // Skip taps falling in the padded border (only possible
                    // when padding is active or the output is cropped).
                    if ((PADDING_Y != 0
                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
                        && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
                    {
                        continue;
                    }

                    const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
                    int iOffset = NB_CHANNELS * iPos;

                    const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;

                    // Fast path: the whole kernel row is inside the input and
                    // contiguous (no dilation), so one long MAC run suffices.
                    if (DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                        || sxMax - sxMin == KERNEL_WIDTH))
                    {
                        macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
                            inputs + iOffset,
                            weights + wOffset,
                            weightedSum);
                    }
                    else {
                        // Slow path: tap-by-tap, skipping out-of-range columns.
                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
                            if ((PADDING_X != 0
                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
                                && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
                            {
                                continue;
                            }

                            int iOffsetInRange = iOffset
                                + sx * DILATION_X * NB_CHANNELS;

                            macsOnRange<NB_CHANNELS>(
                                // same input line so no wrapping can occur
                                inputs + iOffsetInRange,
                                weights + wOffset + sx * NB_CHANNELS,
                                weightedSum);
                        }
                    }
                }

                outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
            }
        }
    }
}
118
+
119
+ #endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
@@ -0,0 +1,171 @@
1
+ #ifndef __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
2
+ #define __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
3
+
4
+ #include "network/typedefs.hpp"
5
+ #include "kernels/activation.hpp"
6
+
7
+ // Generic function for two inputs
8
+
9
+ template<int NB_ELTS,
10
+ ElemWise_T ELEM_OP,
11
+ ActivationFunction_T ACTIVATION,
12
+ typename Input_T, typename Output_T,
13
+ typename Rescaling_T>
14
+ __attribute__((always_inline)) inline
15
+ void elemwise_forward (
16
+ Output_T* __restrict outputs,
17
+ const Rescaling_T& __restrict rescaling,
18
+ const Input_T* __restrict inputs1,
19
+ const Input_T* __restrict inputs2)
20
+ {
21
+ if (std::is_floating_point<Input_T>::value)
22
+ {
23
+ Input_T val = 0;
24
+
25
+ switch (ELEM_OP) {
26
+ case Add: {
27
+ for (int i = 0; i < NB_ELTS; ++i) {
28
+ val = inputs1[i] + inputs2[i];
29
+ outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
30
+ }
31
+ break;
32
+ }
33
+ case Sub: {
34
+ for (int i = 0; i < NB_ELTS; ++i) {
35
+ val = inputs1[i] - inputs2[i];
36
+ outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
37
+
38
+ }
39
+ break;
40
+ }
41
+ case Mul: {
42
+ for (int i = 0; i < NB_ELTS; ++i) {
43
+ val = inputs1[i] * inputs2[i];
44
+ outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
45
+ }
46
+ break;
47
+ }
48
+ default: {
49
+ // Copy inputs1 in outputs for default case
50
+ for (int i = 0; i < NB_ELTS; ++i) {
51
+ val = inputs1[i];
52
+ outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
53
+ }
54
+ break;
55
+ }
56
+ }
57
+ }
58
+ else
59
+ {
60
+ int32_t val = 0;
61
+
62
+ switch (ELEM_OP) {
63
+ case Add: {
64
+ for (int i = 0; i < NB_ELTS; ++i) {
65
+ val = inputs1[i] + inputs2[i];
66
+ outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
67
+ }
68
+ break;
69
+ }
70
+ case Sub: {
71
+ for (int i = 0; i < NB_ELTS; ++i) {
72
+ val = inputs1[i] - inputs2[i];
73
+ outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
74
+ }
75
+ break;
76
+ }
77
+ case Mul: {
78
+ for (int i = 0; i < NB_ELTS; ++i) {
79
+ val = inputs1[i] * inputs2[i];
80
+ outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
81
+ }
82
+ break;
83
+ }
84
+ default: {
85
+ // Copy inputs1 in outputs for default case
86
+ for (int i = 0; i < NB_ELTS; ++i) {
87
+ val = inputs1[i];
88
+ outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
89
+ }
90
+ break;
91
+ }
92
+ }
93
+ }
94
+ }
95
+
96
+
97
+ // Generic function for multiple inputs
98
+ // Not working
99
+
100
+ // template<ElemWise_T ELEM_OP, typename Output_T>
101
+ // __attribute__((always_inline)) inline
102
+ // Output_T elemWise (int /*pos*/, int /*ch*/)
103
+ // {
104
+ // return 0;
105
+ // }
106
+
107
+ // template<ElemWise_T ELEM_OP,
108
+ // int NB_CHANNELS,
109
+ // // For next inputs
110
+ // int... ARGS,
111
+ // typename... INPUTS,
112
+ // // Types
113
+ // typename Input_T, typename Output_T>
114
+ // __attribute__((always_inline)) inline
115
+ // Output_T elemWise (int pos, int ch,
116
+ // const Input_T* __restrict firstInputs,
117
+ // INPUTS... inputs)
118
+ // {
119
+ // int iOffset = NB_CHANNELS * pos;
120
+
121
+ // return firstInputs[iOffset + ch]
122
+ // + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...);
123
+ // }
124
+
125
+ // template<// For all inputs
126
+ // int NB_CHANNELS,
127
+ // int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
128
+ // int NB_ELTS,
129
+ // int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
130
+ // ElemWise_T ELEM_OP,
131
+ // ActivationFunction_T ACTIVATION,
132
+ // // For next inputs
133
+ // int... ARGS,
134
+ // typename... INPUTS,
135
+ // // Types
136
+ // typename Input_T, typename Output_T,
137
+ // typename Rescaling_T>
138
+ // __attribute__((always_inline)) inline
139
+ // void elemWise_forward (
140
+ // Output_T* __restrict outputs,
141
+ // const Rescaling_T& __restrict rescaling,
142
+ // const Input_T* __restrict firstInputs,
143
+ // INPUTS... inputs)
144
+ // {
145
+ // for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
146
+ // for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
147
+ // const int pos = (ox + OUTPUTS_WIDTH * oy);
148
+ // int oOffset = NB_ELTS * pos;
149
+
150
+ // for (int ch = 0; ch < NB_ELTS; ++ch) {
151
+ // const Add_T val = elemWise<ELEM_OP,
152
+ // INPUT_NB_CHANNELS,
153
+ // INPUT_MEM_CONT_OFFSET,
154
+ // INPUT_MEM_CONT_NB_ELTS,
155
+ // INPUT_MEM_WRAP_OFFSET,
156
+ // INPUT_MEM_WRAP_NB_ELTS,
157
+ // INPUT_MEM_STRIDE,
158
+ // ARGS...>(pos, ch, firstInputs, inputs...);
159
+
160
+ // outputs[oOffset + ch]
161
+ // = sat<Output_T>(val, ch, ACTIVATION, rescaling);
162
+ // }
163
+ // }
164
+ // }
165
+ // }
166
+
167
+
168
+
169
+
170
+
171
+ #endif // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
@@ -0,0 +1,72 @@
1
+ #ifndef __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
2
+ #define __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
3
+
4
+ #include "network/typedefs.hpp"
5
+ #include "kernels/rescaling.hpp"
6
+ #include "network/utils.hpp"
7
+ #include "kernels/macs.hpp"
8
+ #include "kernels/activation.hpp"
9
+
10
+ template<int NB_CHANNELS,
11
+ int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
12
+ int NB_OUTPUTS,
13
+ int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
14
+ ActivationFunction_T ACTIVATION,
15
+ typename Input_T, typename Output_T,
16
+ typename Weight_T, typename Bias_T,
17
+ typename Rescaling_T>
18
+ __attribute__((always_inline)) inline
19
+ void fullyconnected_forward (
20
+ const Input_T* __restrict inputs,
21
+ Output_T* __restrict outputs,
22
+ const Weight_T* __restrict weights,
23
+ const Bias_T* __restrict biases,
24
+ const Rescaling_T& __restrict rescaling)
25
+ {
26
+ // Warning, there is a trick here !
27
+ // To use this kernel, the inputs have to be in NHWC and the weights are in NCHW
28
+ // It is only an issue if the FC was after a flatten layer.
29
+ // Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
30
+ // Solution: Add a system to check dataformat
31
+ for (int och = 0; och < NB_OUTPUTS; och++) {
32
+
33
+ Bias_T weightedSum = biases[och];
34
+
35
+ for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
36
+ for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
37
+ for (int ch = 0; ch < NB_CHANNELS; ++ch) {
38
+ weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
39
+ * weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix];
40
+ }
41
+ }
42
+ }
43
+
44
+ outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
45
+ }
46
+ /*
47
+ Here the kernel to use with inputs in NHWC and weights in NHWC
48
+ #pragma omp parallel for
49
+ for (int och = 0; och < NB_OUTPUTS; och++) {
50
+
51
+ Bias_T weightedSum = biases[och];
52
+
53
+ for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
54
+ const int iPos = (CHANNELS_WIDTH * iy);
55
+ int iOffset = NB_CHANNELS * iPos;
56
+
57
+ const int wOffset = NB_CHANNELS * CHANNELS_WIDTH
58
+ * (iy + CHANNELS_HEIGHT * och);
59
+
60
+ macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
61
+ inputs + iOffset,
62
+ weights + wOffset,
63
+ weightedSum);
64
+ }
65
+
66
+ outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
67
+ }
68
+ */
69
+ }
70
+
71
+
72
+ #endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
@@ -0,0 +1,25 @@
1
+ #ifndef __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
2
+ #define __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
3
+
4
+ #include "network/typedefs.hpp"
5
+
6
// LeakyReLU: non-negative values pass through unchanged, negative values
// are scaled by `negative_slope`. Operates element-wise over a flat buffer
// of NB_DATA values.
template<int NB_DATA,
         typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void leakyrelu_forward (
    const Input_T* __restrict inputs,
    Output_T* __restrict outputs,
    const float negative_slope)
{
#pragma omp parallel for
    for (int idx = 0; idx < NB_DATA; ++idx) {
        const Input_T value = inputs[idx];
        if (value < 0) {
            outputs[idx] = negative_slope * value;
        } else {
            outputs[idx] = value;
        }
    }
}
23
+
24
+
25
+ #endif // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__