aidge-export-cpp 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aidge_export_cpp/__init__.py +16 -0
- aidge_export_cpp/_version.py +4 -0
- aidge_export_cpp/export.py +131 -0
- aidge_export_cpp/export_registry.py +10 -0
- aidge_export_cpp/kernels/activation.hpp +77 -0
- aidge_export_cpp/kernels/batchnorm.hpp +41 -0
- aidge_export_cpp/kernels/convolution.hpp +119 -0
- aidge_export_cpp/kernels/elemwise.hpp +171 -0
- aidge_export_cpp/kernels/fullyconnected.hpp +72 -0
- aidge_export_cpp/kernels/leakyrelu.hpp +25 -0
- aidge_export_cpp/kernels/macs.hpp +19 -0
- aidge_export_cpp/kernels/matmul.hpp +33 -0
- aidge_export_cpp/kernels/pooling.hpp +126 -0
- aidge_export_cpp/kernels/rescaling.hpp +16 -0
- aidge_export_cpp/operators.py +282 -0
- aidge_export_cpp/static/Makefile +30 -0
- aidge_export_cpp/static/include/network/typedefs.hpp +32 -0
- aidge_export_cpp/static/include/network/utils.hpp +149 -0
- aidge_export_cpp/templates/configuration/_def_io.jinja +14 -0
- aidge_export_cpp/templates/configuration/_meminfo.jinja +11 -0
- aidge_export_cpp/templates/configuration/activation_config.jinja +14 -0
- aidge_export_cpp/templates/configuration/batchnorm_config.jinja +11 -0
- aidge_export_cpp/templates/configuration/convolution_config.jinja +25 -0
- aidge_export_cpp/templates/configuration/elemwise_config.jinja +13 -0
- aidge_export_cpp/templates/configuration/fullyconnected_config.jinja +16 -0
- aidge_export_cpp/templates/configuration/leakyrelu_config.jinja +11 -0
- aidge_export_cpp/templates/configuration/matmul_config.jinja +15 -0
- aidge_export_cpp/templates/configuration/pooling_config.jinja +17 -0
- aidge_export_cpp/templates/data/inputs.jinja +58 -0
- aidge_export_cpp/templates/data/parameters.jinja +21 -0
- aidge_export_cpp/templates/kernel_forward/_mem_offset.jinja +6 -0
- aidge_export_cpp/templates/kernel_forward/_save_outputs.jinja +20 -0
- aidge_export_cpp/templates/kernel_forward/activation_forward.jinja +7 -0
- aidge_export_cpp/templates/kernel_forward/batchnorm_forward.jinja +9 -0
- aidge_export_cpp/templates/kernel_forward/convolution_forward.jinja +20 -0
- aidge_export_cpp/templates/kernel_forward/elemwise_forward.jinja +8 -0
- aidge_export_cpp/templates/kernel_forward/fullyconnected_forward.jinja +12 -0
- aidge_export_cpp/templates/kernel_forward/leakyrelu_forward.jinja +6 -0
- aidge_export_cpp/templates/kernel_forward/matmul_forward.jinja +5 -0
- aidge_export_cpp/templates/kernel_forward/pooling_forward.jinja +19 -0
- aidge_export_cpp/utils/__init__.py +27 -0
- aidge_export_cpp/utils/converter.py +18 -0
- aidge_export_cpp/utils/generation.py +51 -0
- aidge_export_cpp-0.2.0.dist-info/LICENSE +277 -0
- aidge_export_cpp-0.2.0.dist-info/METADATA +319 -0
- aidge_export_cpp-0.2.0.dist-info/RECORD +48 -0
- aidge_export_cpp-0.2.0.dist-info/WHEEL +5 -0
- aidge_export_cpp-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
r"""
|
|
2
|
+
Aidge Export for CPP standalone projects
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
from .export_registry import ExportLibCpp
|
|
6
|
+
|
|
7
|
+
from .operators import *
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
import aidge_core
|
|
10
|
+
|
|
11
|
+
from aidge_export_cpp.utils import ROOT
|
|
12
|
+
|
|
13
|
+
from ._version import *
|
|
14
|
+
|
|
15
|
+
from .export import *
|
|
16
|
+
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import os
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
import aidge_core
|
|
6
|
+
|
|
7
|
+
from aidge_core.export_utils.code_generation import *
|
|
8
|
+
from aidge_core.mem_info import compute_default_mem_info
|
|
9
|
+
|
|
10
|
+
from aidge_export_cpp.utils import ROOT
|
|
11
|
+
from aidge_export_cpp.utils.converter import numpy_dtype2ctype
|
|
12
|
+
from aidge_export_cpp import ExportLibCpp
|
|
13
|
+
from aidge_export_cpp.utils.generation import *
|
|
14
|
+
# from aidge_export_cpp.memory import *
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def generate_input_file(export_folder: str,
                        array_name: str,
                        array: np.ndarray):
    """Generate a C header embedding *array* as a constant.

    The header is rendered from ``templates/data/inputs.jinja`` and written
    to ``{export_folder}/{array_name}.h``.

    :param export_folder: destination directory; created if missing.
    :param array_name: base name of both the generated header file and the
        C array it declares.
    :param array: data to embed; its numpy dtype is mapped to a C type via
        ``numpy_dtype2ctype``.
    """
    # Create the destination directory if needed.
    # (exist_ok avoids the check-then-create race of the previous
    # os.path.exists() + os.makedirs() sequence.)
    os.makedirs(export_folder, exist_ok=True)

    generate_file(
        file_path=f"{export_folder}/{array_name}.h",
        template_path=str(ROOT / "templates" / "data" / "inputs.jinja"),
        dims=array.shape,
        data_t=numpy_dtype2ctype(array.dtype),
        name=array_name,
        values=array.tolist()
    )
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def export(export_folder_name, graphview, scheduler, mem_wrapping=False):
    """Export the scheduled graph as a standalone C++ project.

    Thin wrapper around :func:`aidge_core.export_utils.scheduler_export`,
    configured with the CPP export library and the default memory layout
    computation.

    :param export_folder_name: destination folder of the generated project.
    :param graphview: exported graph; kept in the signature for backward
        compatibility (the scheduler already references it).
    :param mem_wrapping: currently unused; kept for backward compatibility.
    """
    # NOTE(review): the previous hand-rolled export loop (per-node config
    # generation, forward file, static file copy) was superseded by
    # scheduler_export and has been removed as dead commented-out code.
    aidge_core.export_utils.scheduler_export(
        scheduler,
        export_folder_name,
        ExportLibCpp,
        memory_manager=compute_default_mem_info
    )
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from aidge_core.export_utils import ExportLib
|
|
2
|
+
from aidge_export_cpp.utils import ROOT
|
|
3
|
+
|
|
4
|
+
class ExportLibCpp(ExportLib):
    """Export library for the standalone C++ backend."""
    # Identifier of this export backend within aidge_core.
    _name="export_cpp"
    # Files copied verbatim into every generated export.
    # Key: absolute source path; value: destination directory relative to
    # the export root ("" presumably means the export root itself —
    # the Makefile ends up at top level, the headers under
    # dnn/include/network).
    static_files={
        str(ROOT / "static" / "Makefile"): "",
        str(ROOT / "static" / "include" / "network" / "typedefs.hpp"): "dnn/include/network",
        str(ROOT / "static" / "include" / "network" / "utils.hpp"): "dnn/include/network",
    }
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
|
|
2
|
+
#define __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
|
|
3
|
+
|
|
4
|
+
#include <type_traits>
|
|
5
|
+
#include "network/typedefs.hpp"
|
|
6
|
+
#include "network/utils.hpp"
|
|
7
|
+
#include "kernels/rescaling.hpp"
|
|
8
|
+
|
|
9
|
+
// Saturation helpers: clip a computed value to a sat-bit integer range,
// or pass it through unchanged for floating-point types.

// Floating-point variant: no saturation is applied; the value is only
// converted to Output_T on return.
template<typename Output_T, typename T,
         typename std::enable_if<std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, int32_t /*sat*/)
{
    return value;
}

// Integer variant: clamp `value` to the range of a `sat`-bit integer —
// [0, 2^sat - 1] when Output_T is unsigned, [-2^(sat-1), 2^(sat-1) - 1]
// when it is signed. `clamp` comes from network/utils.hpp.
template<typename Output_T, typename T,
         typename std::enable_if<!std::is_floating_point<T>::value>::type* = nullptr>
__attribute__((always_inline)) inline
Output_T saturate (T value, uint32_t sat)
{
    if (std::is_unsigned<Output_T>::value) {
        return clamp(value, T(0), (T(1) << sat) - 1);
    } else {
        return clamp(value, -(T(1) << (sat - 1)), (T(1) << (sat - 1)) - 1);
    }
}
|
|
28
|
+
|
|
29
|
+
// Apply the activation function `func` to a single weighted sum, then
// rescale the result and saturate it to Output_T.
// `output` is the output channel index, forwarded to the rescaling
// functor.
template<typename Output_T,
         typename Sum_T,
         typename Rescaling_T>
__attribute__((always_inline)) inline
Output_T activation_forward_value (Sum_T weightedSum,
                                   int output,
                                   ActivationFunction_T func,
                                   const Rescaling_T& __restrict rescaling)
{
    switch(func) {
        // Linear and Saturation leave the weighted sum untouched here;
        // saturation itself happens in the final saturate() call below.
        case Linear:
        case Saturation: {
            break;
        }
        // ReLU: clip negative sums to zero.
        case Rectifier: {
            if(weightedSum <= 0) weightedSum = 0;
            break;
        }
        default:
            // Unsupported activation function
            break;
    }

    // Value fixed here for now but it should be generated by
    // the export module or determined by the type of Output_T
    // For now only works for int8_t and uint8_t
    const uint32_t NB_BITS = 8;
    return saturate<Output_T>(rescaling(weightedSum, output), NB_BITS);
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
template<int NB_DATA,
|
|
61
|
+
ActivationFunction_T ACTIVATION,
|
|
62
|
+
typename Input_T, typename Output_T, typename Rescaling_T>
|
|
63
|
+
__attribute__((always_inline)) inline
|
|
64
|
+
void activation_forward (
|
|
65
|
+
const Input_T* __restrict inputs,
|
|
66
|
+
Output_T* __restrict outputs,
|
|
67
|
+
const Rescaling_T& __restrict rescaling)
|
|
68
|
+
{
|
|
69
|
+
for (int i = 0; i < NB_DATA; ++i)
|
|
70
|
+
{
|
|
71
|
+
outputs[i] = activation_forward_value<Output_T>(inputs[i], i, ACTIVATION, rescaling);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
#endif // __AIDGE_EXPORT_CPP_KERNELS_ACTIVATION__
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
#ifndef __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
|
|
2
|
+
#define __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
|
|
3
|
+
|
|
4
|
+
#include "network/typedefs.hpp"
#include "kernels/activation.hpp"
#include "kernels/rescaling.hpp"
#include <math.h>
|
|
7
|
+
|
|
8
|
+
// WARNING: this kernel only works for 32-bits floating point values
|
|
9
|
+
|
|
10
|
+
template<int NB_OUTPUTS,
|
|
11
|
+
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
|
|
12
|
+
ActivationFunction_T ACTIVATION,
|
|
13
|
+
typename Input_T, typename Output_T,
|
|
14
|
+
typename Param_T>
|
|
15
|
+
__attribute__((always_inline)) inline
|
|
16
|
+
void batchnorm_forward (
|
|
17
|
+
const Input_T* __restrict inputs,
|
|
18
|
+
Output_T* __restrict outputs,
|
|
19
|
+
const Param_T* __restrict biases,
|
|
20
|
+
const Param_T* __restrict variances,
|
|
21
|
+
const Param_T* __restrict means,
|
|
22
|
+
const Param_T* __restrict scales,
|
|
23
|
+
const double epsilon)
|
|
24
|
+
{
|
|
25
|
+
for (unsigned int output = 0; output < NB_OUTPUTS; ++output) {
|
|
26
|
+
const Output_T var = sqrt(variances[output] + epsilon);
|
|
27
|
+
|
|
28
|
+
for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
|
|
29
|
+
for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
|
|
30
|
+
const int outputOffset = OUTPUTS_HEIGHT * oy + ox;
|
|
31
|
+
|
|
32
|
+
const Output_T normalized = (inputs[outputOffset + output] - means[output]) / var;
|
|
33
|
+
const Output_T sAs = scales[output] * normalized + biases[output];
|
|
34
|
+
outputs[outputOffset + output] = sat<Output_T>(sAs, output, ACTIVATION, NoScaling);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
#endif // __AIDGE_EXPORT_CPP_KERNELS_BATCHNORM__
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
#ifndef __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
|
|
2
|
+
#define __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
|
|
3
|
+
|
|
4
|
+
#include "network/typedefs.hpp"
|
|
5
|
+
#include "kernels/rescaling.hpp"
|
|
6
|
+
#include "network/utils.hpp"
|
|
7
|
+
#include "kernels/macs.hpp"
|
|
8
|
+
#include "kernels/activation.hpp"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
// 2D convolution over NHWC inputs, with padding, stride and dilation.
// Weights are laid out [output][kernel_y][kernel_x][channel] (see wOffset).
// Inner products are delegated to macsOnRange (kernels/macs.hpp); the
// activation/rescaling step to activation_forward_value.
template<int NB_CHANNELS,
         int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
         int NB_OUTPUTS,
         int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
         int PADDING_Y, int PADDING_X,
         int STRIDE_Y, int STRIDE_X,
         int DILATION_Y, int DILATION_X,
         int KERNEL_HEIGHT, int KERNEL_WIDTH,
         ActivationFunction_T ACTIVATION,
         typename Input_T, typename Output_T,
         typename Weight_T, typename Bias_T,
         typename Rescaling_T>
__attribute__((always_inline)) inline
void convolution_forward(
    const Input_T* __restrict inputs,
    Output_T* __restrict outputs,
    const Weight_T* __restrict weights,
    const Bias_T* __restrict biases,
    const Rescaling_T& __restrict rescaling)
{
    // Effective kernel extents once dilation is applied.
    constexpr int DILATED_KERNEL_HEIGHT
            = KERNEL_HEIGHT + (DILATION_Y - 1) * (KERNEL_HEIGHT - 1);

    constexpr int DILATED_KERNEL_WIDTH
            = KERNEL_WIDTH + (DILATION_X - 1) * (KERNEL_WIDTH - 1);

    // Output sizes the convolution would have without padding; comparing
    // against OUTPUTS_HEIGHT/WIDTH below lets the padded-border checks be
    // skipped entirely when no padding is in effect.
    constexpr int OUTPUTS_HEIGHT_NOPAD
        = (CHANNELS_HEIGHT - DILATION_Y * (KERNEL_HEIGHT - 1) - 1 + STRIDE_Y) / STRIDE_Y;
    constexpr int OUTPUTS_WIDTH_NOPAD
        = (CHANNELS_WIDTH - DILATION_X * (KERNEL_WIDTH - 1) - 1 + STRIDE_X) / STRIDE_X;

    for (int oy = 0; oy < OUTPUTS_HEIGHT; ++oy) {
        // Vertical kernel range [syMin, syMax) that stays inside the input
        // for this output row (handles top/bottom padding).
        const int syMin = (PADDING_Y == 0) ? 0
            : max(PADDING_Y - (oy * STRIDE_Y), 0);
        const int syMax = (PADDING_Y == 0
                && OUTPUTS_HEIGHT == OUTPUTS_HEIGHT_NOPAD) ? DILATED_KERNEL_HEIGHT
            : clamp(CHANNELS_HEIGHT + PADDING_Y - (oy * STRIDE_Y),
                    0, DILATED_KERNEL_HEIGHT);
        // Top-left input row of the receptive field (may be negative under
        // padding; such rows are skipped via syMin/syMax).
        const int iy = (oy * STRIDE_Y) - PADDING_Y;

#pragma omp parallel for collapse(2)
        for (int ox = 0; ox < OUTPUTS_WIDTH; ++ox) {
            for (int output = 0; output < NB_OUTPUTS; ++output) {
                // moved to inner loop for collapsing -->
                // Horizontal kernel range [sxMin, sxMax), mirroring the
                // vertical computation above.
                const int sxMin = (PADDING_X == 0) ? 0
                    : max(PADDING_X - (ox * STRIDE_X), 0);
                const int sxMax = (PADDING_X == 0
                        && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                            ? DILATED_KERNEL_WIDTH
                    : clamp(CHANNELS_WIDTH + PADDING_X - (ox * STRIDE_X),
                            0, DILATED_KERNEL_WIDTH);
                const int ix = (ox * STRIDE_X) - PADDING_X;

                // NHWC offset of output pixel (oy, ox).
                const int oPos = (ox + OUTPUTS_WIDTH * oy);
                int oOffset = NB_OUTPUTS * oPos;

                // <--

                Bias_T weightedSum = biases[output];

                for (int sy = 0; sy < KERNEL_HEIGHT; ++sy) {
                    // Skip kernel rows that fall into the padded border.
                    if ((PADDING_Y != 0
                            || OUTPUTS_HEIGHT != OUTPUTS_HEIGHT_NOPAD)
                        && ((sy*DILATION_Y < syMin) || (sy*DILATION_Y >= syMax)))
                    {
                        continue;
                    }

                    // NHWC offset of the input row under kernel row sy.
                    const int iPos = ix + CHANNELS_WIDTH * (iy + sy*DILATION_Y);
                    int iOffset = NB_CHANNELS * iPos;

                    // Start of kernel row sy for this output channel.
                    const int wOffset = (output*KERNEL_HEIGHT + sy) * KERNEL_WIDTH * NB_CHANNELS;

                    // Fast path: the whole kernel row is in range and
                    // contiguous (no dilation), so one MAC sweep covers it.
                    if (DILATION_X == 1 && ((PADDING_X == 0 && OUTPUTS_WIDTH == OUTPUTS_WIDTH_NOPAD)
                        || sxMax - sxMin == KERNEL_WIDTH))
                    {
                        macsOnRange<KERNEL_WIDTH * NB_CHANNELS>(
                            inputs + iOffset,
                            weights + wOffset,
                            weightedSum);
                    }
                    else {
                        // Slow path: walk the kernel row element by element,
                        // skipping positions in the padded border.
                        for (int sx = 0; sx < KERNEL_WIDTH; ++sx) {
                            if ((PADDING_X != 0
                                    || OUTPUTS_WIDTH != OUTPUTS_WIDTH_NOPAD)
                                && ((sx*DILATION_X < sxMin) || (sx*DILATION_X >= sxMax)))
                            {
                                continue;
                            }

                            int iOffsetInRange = iOffset
                                + sx * DILATION_X * NB_CHANNELS;

                            macsOnRange<NB_CHANNELS>(
                                // same input line so no wrapping can occur
                                inputs + iOffsetInRange,
                                weights + wOffset + sx * NB_CHANNELS,
                                weightedSum);
                        }
                    }
                }

                outputs[oOffset + output] = activation_forward_value<Output_T>(weightedSum, output, ACTIVATION, rescaling);
            }
        }
    }
}
|
|
118
|
+
|
|
119
|
+
#endif // __AIDGE_EXPORT_CPP_KERNELS_CONVOLUTION__
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
#ifndef __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
|
|
2
|
+
#define __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
|
|
3
|
+
|
|
4
|
+
#include "network/typedefs.hpp"
|
|
5
|
+
#include "kernels/activation.hpp"
|
|
6
|
+
|
|
7
|
+
// Generic function for two inputs
|
|
8
|
+
|
|
9
|
+
// Element-wise binary kernel (Add / Sub / Mul) over two equally sized
// inputs, followed by activation/rescaling of each result.
// Two mirrored branches select the accumulator type: the native Input_T
// for floating-point inputs, int32_t otherwise (so narrow integer inputs
// accumulate without overflow). Unknown ELEM_OP values degrade to a copy
// of inputs1.
template<int NB_ELTS,
         ElemWise_T ELEM_OP,
         ActivationFunction_T ACTIVATION,
         typename Input_T, typename Output_T,
         typename Rescaling_T>
__attribute__((always_inline)) inline
void elemwise_forward (
    Output_T* __restrict outputs,
    const Rescaling_T& __restrict rescaling,
    const Input_T* __restrict inputs1,
    const Input_T* __restrict inputs2)
{
    if (std::is_floating_point<Input_T>::value)
    {
        // Floating-point accumulator.
        Input_T val = 0;

        switch (ELEM_OP) {
            case Add: {
                for (int i = 0; i < NB_ELTS; ++i) {
                    val = inputs1[i] + inputs2[i];
                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
                }
                break;
            }
            case Sub: {
                for (int i = 0; i < NB_ELTS; ++i) {
                    val = inputs1[i] - inputs2[i];
                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);

                }
                break;
            }
            case Mul: {
                for (int i = 0; i < NB_ELTS; ++i) {
                    val = inputs1[i] * inputs2[i];
                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
                }
                break;
            }
            default: {
                // Copy inputs1 in outputs for default case
                for (int i = 0; i < NB_ELTS; ++i) {
                    val = inputs1[i];
                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
                }
                break;
            }
        }
    }
    else
    {
        // Integer accumulator: widen to 32 bits before the arithmetic.
        int32_t val = 0;

        switch (ELEM_OP) {
            case Add: {
                for (int i = 0; i < NB_ELTS; ++i) {
                    val = inputs1[i] + inputs2[i];
                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
                }
                break;
            }
            case Sub: {
                for (int i = 0; i < NB_ELTS; ++i) {
                    val = inputs1[i] - inputs2[i];
                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
                }
                break;
            }
            case Mul: {
                for (int i = 0; i < NB_ELTS; ++i) {
                    val = inputs1[i] * inputs2[i];
                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
                }
                break;
            }
            default: {
                // Copy inputs1 in outputs for default case
                for (int i = 0; i < NB_ELTS; ++i) {
                    val = inputs1[i];
                    outputs[i] = activation_forward_value<Output_T>(val, i, ACTIVATION, rescaling);
                }
                break;
            }
        }
    }
}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
// Generic function for multiple inputs
|
|
98
|
+
// Not working
|
|
99
|
+
|
|
100
|
+
// template<ElemWise_T ELEM_OP, typename Output_T>
|
|
101
|
+
// __attribute__((always_inline)) inline
|
|
102
|
+
// Output_T elemWise (int /*pos*/, int /*ch*/)
|
|
103
|
+
// {
|
|
104
|
+
// return 0;
|
|
105
|
+
// }
|
|
106
|
+
|
|
107
|
+
// template<ElemWise_T ELEM_OP,
|
|
108
|
+
// int NB_CHANNELS,
|
|
109
|
+
// // For next inputs
|
|
110
|
+
// int... ARGS,
|
|
111
|
+
// typename... INPUTS,
|
|
112
|
+
// // Types
|
|
113
|
+
// typename Input_T, typename Output_T>
|
|
114
|
+
// __attribute__((always_inline)) inline
|
|
115
|
+
// Output_T elemWise (int pos, int ch,
|
|
116
|
+
// const Input_T* __restrict firstInputs,
|
|
117
|
+
// INPUTS... inputs)
|
|
118
|
+
// {
|
|
119
|
+
// int iOffset = NB_CHANNELS * pos;
|
|
120
|
+
|
|
121
|
+
// return firstInputs[iOffset + ch]
|
|
122
|
+
// + elemWise<ELEM_OP, ARGS...>(pos, ch, inputs...);
|
|
123
|
+
// }
|
|
124
|
+
|
|
125
|
+
// template<// For all inputs
|
|
126
|
+
// int NB_CHANNELS,
|
|
127
|
+
// int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
|
|
128
|
+
// int NB_ELTS,
|
|
129
|
+
// int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
|
|
130
|
+
// ElemWise_T ELEM_OP,
|
|
131
|
+
// ActivationFunction_T ACTIVATION,
|
|
132
|
+
// // For next inputs
|
|
133
|
+
// int... ARGS,
|
|
134
|
+
// typename... INPUTS,
|
|
135
|
+
// // Types
|
|
136
|
+
// typename Input_T, typename Output_T,
|
|
137
|
+
// typename Rescaling_T>
|
|
138
|
+
// __attribute__((always_inline)) inline
|
|
139
|
+
// void elemWise_forward (
|
|
140
|
+
// Output_T* __restrict outputs,
|
|
141
|
+
// const Rescaling_T& __restrict rescaling,
|
|
142
|
+
// const Input_T* __restrict firstInputs,
|
|
143
|
+
// INPUTS... inputs)
|
|
144
|
+
// {
|
|
145
|
+
// for (int oy = 0; oy < OUTPUTS_HEIGHT; oy++) {
|
|
146
|
+
// for (int ox = 0; ox < OUTPUTS_WIDTH; ox++) {
|
|
147
|
+
// const int pos = (ox + OUTPUTS_WIDTH * oy);
|
|
148
|
+
// int oOffset = NB_ELTS * pos;
|
|
149
|
+
|
|
150
|
+
// for (int ch = 0; ch < NB_ELTS; ++ch) {
|
|
151
|
+
// const Add_T val = elemWise<ELEM_OP,
|
|
152
|
+
// INPUT_NB_CHANNELS,
|
|
153
|
+
// INPUT_MEM_CONT_OFFSET,
|
|
154
|
+
// INPUT_MEM_CONT_NB_ELTS,
|
|
155
|
+
// INPUT_MEM_WRAP_OFFSET,
|
|
156
|
+
// INPUT_MEM_WRAP_NB_ELTS,
|
|
157
|
+
// INPUT_MEM_STRIDE,
|
|
158
|
+
// ARGS...>(pos, ch, firstInputs, inputs...);
|
|
159
|
+
|
|
160
|
+
// outputs[oOffset + ch]
|
|
161
|
+
// = sat<Output_T>(val, ch, ACTIVATION, rescaling);
|
|
162
|
+
// }
|
|
163
|
+
// }
|
|
164
|
+
// }
|
|
165
|
+
// }
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
#endif // __AIDGE_EXPORT_CPP_KERNELS_ELEMWISE__
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
#ifndef __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
|
|
2
|
+
#define __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
|
|
3
|
+
|
|
4
|
+
#include "network/typedefs.hpp"
|
|
5
|
+
#include "kernels/rescaling.hpp"
|
|
6
|
+
#include "network/utils.hpp"
|
|
7
|
+
#include "kernels/macs.hpp"
|
|
8
|
+
#include "kernels/activation.hpp"
|
|
9
|
+
|
|
10
|
+
template<int NB_CHANNELS,
|
|
11
|
+
int CHANNELS_HEIGHT, int CHANNELS_WIDTH,
|
|
12
|
+
int NB_OUTPUTS,
|
|
13
|
+
int OUTPUTS_HEIGHT, int OUTPUTS_WIDTH,
|
|
14
|
+
ActivationFunction_T ACTIVATION,
|
|
15
|
+
typename Input_T, typename Output_T,
|
|
16
|
+
typename Weight_T, typename Bias_T,
|
|
17
|
+
typename Rescaling_T>
|
|
18
|
+
__attribute__((always_inline)) inline
|
|
19
|
+
void fullyconnected_forward (
|
|
20
|
+
const Input_T* __restrict inputs,
|
|
21
|
+
Output_T* __restrict outputs,
|
|
22
|
+
const Weight_T* __restrict weights,
|
|
23
|
+
const Bias_T* __restrict biases,
|
|
24
|
+
const Rescaling_T& __restrict rescaling)
|
|
25
|
+
{
|
|
26
|
+
// Warning, there is a trick here !
|
|
27
|
+
// To use this kernel, the inputs have to be in NHWC and the weights are in NCHW
|
|
28
|
+
// It is only an issue if the FC was after a flatten layer.
|
|
29
|
+
// Otherwise it is not an issue for the other FC because CHANNELS_WIDTH = CHANNELS_HEIGHT = 1
|
|
30
|
+
// Solution: Add a system to check dataformat
|
|
31
|
+
for (int och = 0; och < NB_OUTPUTS; och++) {
|
|
32
|
+
|
|
33
|
+
Bias_T weightedSum = biases[och];
|
|
34
|
+
|
|
35
|
+
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
|
|
36
|
+
for (int ix = 0; ix < CHANNELS_WIDTH; ++ix) {
|
|
37
|
+
for (int ch = 0; ch < NB_CHANNELS; ++ch) {
|
|
38
|
+
weightedSum += inputs[CHANNELS_WIDTH*NB_CHANNELS*iy + NB_CHANNELS*ix + ch]
|
|
39
|
+
* weights[CHANNELS_HEIGHT*CHANNELS_WIDTH*NB_CHANNELS*och + CHANNELS_HEIGHT*CHANNELS_WIDTH*ch + CHANNELS_HEIGHT*iy + ix];
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
|
|
45
|
+
}
|
|
46
|
+
/*
|
|
47
|
+
Here the kernel to use with inputs in NHWC and weights in NHWC
|
|
48
|
+
#pragma omp parallel for
|
|
49
|
+
for (int och = 0; och < NB_OUTPUTS; och++) {
|
|
50
|
+
|
|
51
|
+
Bias_T weightedSum = biases[och];
|
|
52
|
+
|
|
53
|
+
for (int iy = 0; iy < CHANNELS_HEIGHT; ++iy) {
|
|
54
|
+
const int iPos = (CHANNELS_WIDTH * iy);
|
|
55
|
+
int iOffset = NB_CHANNELS * iPos;
|
|
56
|
+
|
|
57
|
+
const int wOffset = NB_CHANNELS * CHANNELS_WIDTH
|
|
58
|
+
* (iy + CHANNELS_HEIGHT * och);
|
|
59
|
+
|
|
60
|
+
macsOnRange<NB_CHANNELS * CHANNELS_WIDTH>(
|
|
61
|
+
inputs + iOffset,
|
|
62
|
+
weights + wOffset,
|
|
63
|
+
weightedSum);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
outputs[och] = activation_forward_value<Output_T>(weightedSum, och, ACTIVATION, rescaling);
|
|
67
|
+
}
|
|
68
|
+
*/
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
#endif // __AIDGE_EXPORT_CPP_KERNELS_FULLYCONNECTED__
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#ifndef __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
|
|
2
|
+
#define __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
|
|
3
|
+
|
|
4
|
+
#include "network/typedefs.hpp"
|
|
5
|
+
|
|
6
|
+
// LeakyReLU kernel: pass non-negative inputs through unchanged and scale
// negative inputs by negative_slope.
template<int NB_DATA,
         typename Input_T, typename Output_T>
__attribute__((always_inline)) inline
void leakyrelu_forward (
    const Input_T* __restrict inputs,
    Output_T* __restrict outputs,
    const float negative_slope)
{
#pragma omp parallel for
    for (int idx = 0; idx < NB_DATA; ++idx) {
        const Input_T value = inputs[idx];
        if (value < 0) {
            outputs[idx] = negative_slope * value;
        } else {
            outputs[idx] = value;
        }
    }
}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
#endif // __AIDGE_EXPORT_CPP_KERNELS_LEAKYRELU__
|