da4ml 0.2.1__py3-none-any.whl → 0.3.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of da4ml might be problematic. Click here for more details.
- da4ml/_version.py +2 -2
- da4ml/cmvm/types.py +95 -15
- da4ml/codegen/__init__.py +5 -4
- da4ml/codegen/cpp/__init__.py +2 -1
- da4ml/codegen/cpp/cpp_codegen.py +56 -23
- da4ml/codegen/cpp/hls_model.py +252 -0
- da4ml/codegen/cpp/source/ap_types/ap_binary.h +78 -0
- da4ml/codegen/cpp/source/ap_types/ap_common.h +376 -0
- da4ml/codegen/cpp/source/ap_types/ap_decl.h +212 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed.h +360 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed_base.h +2354 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed_ref.h +718 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed_special.h +230 -0
- da4ml/codegen/cpp/source/ap_types/ap_int.h +330 -0
- da4ml/codegen/cpp/source/ap_types/ap_int_base.h +1885 -0
- da4ml/codegen/cpp/source/ap_types/ap_int_ref.h +1346 -0
- da4ml/codegen/cpp/source/ap_types/ap_int_special.h +223 -0
- da4ml/codegen/cpp/source/ap_types/ap_shift_reg.h +138 -0
- da4ml/codegen/cpp/source/ap_types/etc/ap_private.h +7199 -0
- da4ml/codegen/cpp/source/ap_types/hls_math.h +27 -0
- da4ml/codegen/cpp/source/ap_types/hls_stream.h +263 -0
- da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h +80 -0
- da4ml/codegen/cpp/source/binder_util.hh +56 -0
- da4ml/codegen/cpp/source/build_binder.mk +24 -0
- da4ml/codegen/cpp/source/{vitis.h → vitis_bitshift.hh} +1 -1
- da4ml/codegen/verilog/__init__.py +2 -3
- da4ml/codegen/verilog/comb.py +65 -24
- da4ml/codegen/verilog/io_wrapper.py +36 -141
- da4ml/codegen/verilog/source/binder_util.hh +72 -0
- da4ml/codegen/verilog/source/mux.v +58 -0
- da4ml/codegen/verilog/source/negative.v +28 -0
- da4ml/codegen/verilog/source/shift_adder.v +4 -1
- da4ml/codegen/verilog/source/template.xdc +3 -0
- da4ml/codegen/verilog/verilog_model.py +36 -12
- da4ml/converter/__init__.py +0 -0
- da4ml/converter/hgq2/parser.py +105 -0
- da4ml/converter/hgq2/replica.py +383 -0
- da4ml/trace/__init__.py +2 -2
- da4ml/trace/fixed_variable.py +175 -16
- da4ml/trace/fixed_variable_array.py +109 -4
- da4ml/trace/ops/__init__.py +22 -6
- da4ml/trace/ops/conv_utils.py +147 -15
- da4ml/trace/ops/einsum_utils.py +9 -6
- da4ml/trace/ops/reduce_utils.py +103 -0
- da4ml/trace/pipeline.py +36 -34
- da4ml/trace/tracer.py +37 -7
- da4ml-0.3.0.post1.dist-info/METADATA +107 -0
- da4ml-0.3.0.post1.dist-info/RECORD +64 -0
- da4ml/codegen/cpp/source/vitis_bridge.h +0 -17
- da4ml-0.2.1.dist-info/METADATA +0 -65
- da4ml-0.2.1.dist-info/RECORD +0 -39
- /da4ml/codegen/verilog/source/{ioutils.hh → ioutil.hh} +0 -0
- {da4ml-0.2.1.dist-info → da4ml-0.3.0.post1.dist-info}/WHEEL +0 -0
- {da4ml-0.2.1.dist-info → da4ml-0.3.0.post1.dist-info}/licenses/LICENSE +0 -0
- {da4ml-0.2.1.dist-info → da4ml-0.3.0.post1.dist-info}/top_level.txt +0 -0
|
@@ -7,7 +7,7 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
|
|
|
7
7
|
N = len(qints)
|
|
8
8
|
ks, _is, fs = zip(*map(_minimal_kif, qints))
|
|
9
9
|
Is = [_i + _k for _i, _k in zip(_is, ks)]
|
|
10
|
-
max_I, max_f = max(
|
|
10
|
+
max_I, max_f = max(_is) + max(ks), max(fs)
|
|
11
11
|
max_bw = max_I + max_f
|
|
12
12
|
width_regular, width_packed = max_bw * N, sum(Is) + sum(fs)
|
|
13
13
|
|
|
@@ -32,11 +32,16 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
|
|
|
32
32
|
copy_from = hetero[i][0] if ks[i] else -1
|
|
33
33
|
pads.append((base + max_bw - 1, base + max_bw - bias_high, copy_from))
|
|
34
34
|
|
|
35
|
+
mask = list(high < low for high, low in hetero)
|
|
36
|
+
regular = [r for r, m in zip(regular, mask) if not m]
|
|
37
|
+
hetero = [h for h, m in zip(hetero, mask) if not m]
|
|
38
|
+
|
|
35
39
|
if not merge:
|
|
36
40
|
return regular, hetero, pads, (width_regular, width_packed)
|
|
37
41
|
|
|
38
42
|
# Merging consecutive intervals when possible
|
|
39
|
-
|
|
43
|
+
NN = len(regular) - 2
|
|
44
|
+
for i in range(NN, -1, -1):
|
|
40
45
|
this_high = regular[i][0]
|
|
41
46
|
next_low = regular[i + 1][1]
|
|
42
47
|
if next_low - this_high != 1:
|
|
@@ -65,6 +70,8 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe
|
|
|
65
70
|
_out_assignment: list[tuple[int, str]] = []
|
|
66
71
|
|
|
67
72
|
for i, ((ih, jh), (ir, jr)) in enumerate(zip(het_out, reg_out)):
|
|
73
|
+
if ih == jh - 1:
|
|
74
|
+
continue
|
|
68
75
|
_out_assignment.append((ih, f'assign out[{ir}:{jr}] = packed_out[{ih}:{jh}];'))
|
|
69
76
|
|
|
70
77
|
for i, (i, j, copy_from) in enumerate(pad_out):
|
|
@@ -86,12 +93,12 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe
|
|
|
86
93
|
|
|
87
94
|
module {module_name}_wrapper ({clk_and_rst_inp}
|
|
88
95
|
// verilator lint_off UNUSEDSIGNAL
|
|
89
|
-
input [{w_reg_in-1}:0] inp,
|
|
96
|
+
input [{w_reg_in - 1}:0] inp,
|
|
90
97
|
// verilator lint_on UNUSEDSIGNAL
|
|
91
|
-
output [{w_reg_out-1}:0] out
|
|
98
|
+
output [{w_reg_out - 1}:0] out
|
|
92
99
|
);
|
|
93
|
-
wire [{w_het_in-1}:0] packed_inp;
|
|
94
|
-
wire [{w_het_out-1}:0] packed_out;
|
|
100
|
+
wire [{w_het_in - 1}:0] packed_inp;
|
|
101
|
+
wire [{w_het_out - 1}:0] packed_out;
|
|
95
102
|
|
|
96
103
|
{inp_assignment_str}
|
|
97
104
|
|
|
@@ -106,150 +113,38 @@ endmodule
|
|
|
106
113
|
"""
|
|
107
114
|
|
|
108
115
|
|
|
109
|
-
def
|
|
110
|
-
k_in, i_in, f_in = zip(*map(_minimal_kif, sol.inp_qint))
|
|
111
|
-
k_out, i_out, f_out = zip(*map(_minimal_kif, sol.out_qint))
|
|
112
|
-
max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
|
|
113
|
-
max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
|
|
114
|
-
|
|
115
|
-
n_in, n_out = sol.shape
|
|
116
|
-
return f"""#include "V{module_name}.h"
|
|
117
|
-
#include "ioutils.hh"
|
|
118
|
-
#include <verilated.h>
|
|
119
|
-
|
|
120
|
-
#ifdef _OPENMP
|
|
121
|
-
#include <omp.h>
|
|
122
|
-
constexpr bool _openmp = true;
|
|
123
|
-
#else
|
|
124
|
-
constexpr bool _openmp = false;
|
|
125
|
-
#endif
|
|
126
|
-
|
|
127
|
-
constexpr size_t N_inp = {n_in};
|
|
128
|
-
constexpr size_t N_out = {n_out};
|
|
129
|
-
constexpr size_t max_inp_bw = {max_inp_bw};
|
|
130
|
-
constexpr size_t max_out_bw = {max_out_bw};
|
|
131
|
-
typedef V{module_name} dut_t;
|
|
132
|
-
|
|
133
|
-
extern "C" {{
|
|
134
|
-
|
|
135
|
-
bool openmp_enabled() {{
|
|
136
|
-
return _openmp;
|
|
137
|
-
}}
|
|
138
|
-
|
|
139
|
-
void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
|
|
140
|
-
dut_t *dut = new dut_t;
|
|
141
|
-
|
|
142
|
-
for (size_t i = 0; i < n_samples; ++i) {{
|
|
143
|
-
write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[i * N_inp]);
|
|
144
|
-
dut->eval();
|
|
145
|
-
read_output<N_out, max_out_bw>(dut->out, &c_out[i * N_out]);
|
|
146
|
-
}}
|
|
147
|
-
|
|
148
|
-
dut->final();
|
|
149
|
-
delete dut;
|
|
150
|
-
}}
|
|
151
|
-
|
|
152
|
-
void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
|
|
153
|
-
size_t n_max_threads = omp_get_max_threads();
|
|
154
|
-
size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
|
|
155
|
-
size_t n_thread = n_samples / n_samples_per_thread;
|
|
156
|
-
n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
|
|
157
|
-
|
|
158
|
-
#ifdef _OPENMP
|
|
159
|
-
#pragma omp parallel for num_threads(n_thread) schedule(static)
|
|
160
|
-
for (size_t i = 0; i < n_thread; ++i) {{
|
|
161
|
-
size_t start = i * n_samples_per_thread;
|
|
162
|
-
size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
|
|
163
|
-
size_t n_samples_this_thread = end - start;
|
|
164
|
-
|
|
165
|
-
_inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
|
|
166
|
-
}}
|
|
167
|
-
#else
|
|
168
|
-
_inference(c_inp, c_out, n_samples);
|
|
169
|
-
#endif
|
|
170
|
-
}}
|
|
171
|
-
}}"""
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
def pipeline_binder_gen(csol: CascadedSolution, module_name: str, II: int = 1, latency_multiplier: int = 1):
|
|
116
|
+
def binder_gen(csol: CascadedSolution | Solution, module_name: str, II: int = 1, latency_multiplier: int = 1):
|
|
175
117
|
k_in, i_in, f_in = zip(*map(_minimal_kif, csol.inp_qint))
|
|
176
118
|
k_out, i_out, f_out = zip(*map(_minimal_kif, csol.out_qint))
|
|
177
|
-
max_inp_bw = max(
|
|
178
|
-
max_out_bw = max(
|
|
179
|
-
|
|
180
|
-
|
|
119
|
+
max_inp_bw = max(k_in) + max(i_in) + max(f_in)
|
|
120
|
+
max_out_bw = max(k_out) + max(i_out) + max(f_out)
|
|
121
|
+
if isinstance(csol, Solution):
|
|
122
|
+
II = latency = 0
|
|
123
|
+
else:
|
|
124
|
+
latency = len(csol.solutions) * latency_multiplier
|
|
181
125
|
|
|
182
126
|
n_in, n_out = csol.shape
|
|
183
|
-
return f"""#include
|
|
184
|
-
#include "
|
|
185
|
-
#include
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
constexpr size_t max_inp_bw = {max_inp_bw};
|
|
197
|
-
constexpr size_t max_out_bw = {max_out_bw};
|
|
198
|
-
constexpr size_t II = {II};
|
|
199
|
-
constexpr size_t latency = {latency};
|
|
200
|
-
typedef V{module_name} dut_t;
|
|
127
|
+
return f"""#include <cstddef>
|
|
128
|
+
#include "binder_util.hh"
|
|
129
|
+
#include "V{module_name}.h"
|
|
130
|
+
|
|
131
|
+
struct {module_name}_config {{
|
|
132
|
+
static const size_t N_inp = {n_in};
|
|
133
|
+
static const size_t N_out = {n_out};
|
|
134
|
+
static const size_t max_inp_bw = {max_inp_bw};
|
|
135
|
+
static const size_t max_out_bw = {max_out_bw};
|
|
136
|
+
static const size_t II = {II};
|
|
137
|
+
static const size_t latency = {latency};
|
|
138
|
+
typedef V{module_name} dut_t;
|
|
139
|
+
}};
|
|
201
140
|
|
|
202
141
|
extern "C" {{
|
|
203
|
-
|
|
204
142
|
bool openmp_enabled() {{
|
|
205
143
|
return _openmp;
|
|
206
144
|
}}
|
|
207
145
|
|
|
208
|
-
void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
|
|
209
|
-
dut_t *dut = new dut_t;
|
|
210
|
-
|
|
211
|
-
size_t clk_req = n_samples * II + latency + 1;
|
|
212
|
-
|
|
213
|
-
for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {{
|
|
214
|
-
size_t t_out = t_inp - latency - 1;
|
|
215
|
-
|
|
216
|
-
if (t_inp < n_samples * II && t_inp % II == 0) {{
|
|
217
|
-
write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[t_inp / II * N_inp]);
|
|
218
|
-
}}
|
|
219
|
-
|
|
220
|
-
dut->clk = 0;
|
|
221
|
-
dut->eval();
|
|
222
|
-
|
|
223
|
-
if (t_inp > latency && t_out % II == 0) {{
|
|
224
|
-
read_output<N_out, max_out_bw>(dut->out, &c_out[t_out / II * N_out]);
|
|
225
|
-
}}
|
|
226
|
-
|
|
227
|
-
dut->clk = 1;
|
|
228
|
-
dut->eval();
|
|
229
|
-
}}
|
|
230
|
-
|
|
231
|
-
dut->final();
|
|
232
|
-
delete dut;
|
|
233
|
-
}}
|
|
234
|
-
|
|
235
146
|
void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
|
|
236
|
-
|
|
237
|
-
size_t n_max_threads = omp_get_max_threads();
|
|
238
|
-
size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
|
|
239
|
-
size_t n_thread = n_samples / n_samples_per_thread;
|
|
240
|
-
n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
|
|
241
|
-
|
|
242
|
-
#pragma omp parallel for num_threads(n_thread) schedule(static)
|
|
243
|
-
for (size_t i = 0; i < n_thread; ++i) {{
|
|
244
|
-
size_t start = i * n_samples_per_thread;
|
|
245
|
-
size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
|
|
246
|
-
size_t n_samples_this_thread = end - start;
|
|
247
|
-
|
|
248
|
-
_inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
|
|
249
|
-
}}
|
|
250
|
-
#else
|
|
251
|
-
_inference(c_inp, c_out, n_samples);
|
|
252
|
-
#endif
|
|
147
|
+
batch_inference<{module_name}_config>(c_inp, c_out, n_samples);
|
|
253
148
|
}}
|
|
254
|
-
|
|
255
|
-
|
|
149
|
+
}}
|
|
150
|
+
"""
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
#include "ioutil.hh"
|
|
2
|
+
#include <verilated.h>
|
|
3
|
+
|
|
4
|
+
#ifdef _OPENMP
|
|
5
|
+
#include <omp.h>
|
|
6
|
+
constexpr bool _openmp = true;
|
|
7
|
+
#else
|
|
8
|
+
constexpr bool _openmp = false;
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
template <typename CONFIG_T>
|
|
12
|
+
std::enable_if_t<CONFIG_T::II != 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
|
|
13
|
+
typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
|
|
14
|
+
|
|
15
|
+
size_t clk_req = n_samples * CONFIG_T::II + CONFIG_T::latency + 1;
|
|
16
|
+
|
|
17
|
+
for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {
|
|
18
|
+
size_t t_out = t_inp - CONFIG_T::latency - 1;
|
|
19
|
+
|
|
20
|
+
if (t_inp < n_samples * CONFIG_T::II && t_inp % CONFIG_T::II == 0) {
|
|
21
|
+
write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[t_inp / CONFIG_T::II * CONFIG_T::N_inp]);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
dut->clk = 0;
|
|
25
|
+
dut->eval();
|
|
26
|
+
|
|
27
|
+
if (t_inp > CONFIG_T::latency && t_out % CONFIG_T::II == 0) {
|
|
28
|
+
read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[t_out / CONFIG_T::II * CONFIG_T::N_out]);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
dut->clk = 1;
|
|
32
|
+
dut->eval();
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
dut->final();
|
|
36
|
+
delete dut;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
template <typename CONFIG_T>
|
|
40
|
+
std::enable_if_t<CONFIG_T::II == 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
|
|
41
|
+
typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
|
|
42
|
+
|
|
43
|
+
for (size_t i = 0; i < n_samples; ++i) {
|
|
44
|
+
write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[i * CONFIG_T::N_inp]);
|
|
45
|
+
dut->eval();
|
|
46
|
+
read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[i * CONFIG_T::N_out]);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
dut->final();
|
|
50
|
+
delete dut;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
// Run inference over n_samples samples, splitting the batch across OpenMP
// threads when OpenMP is enabled and falling back to a single call otherwise.
//
// c_inp / c_out are flat buffers holding N_inp / N_out int32 values per sample.
template <typename CONFIG_T> void batch_inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
    // Nothing to do for an empty batch. Without this guard the OpenMP path
    // below computes n_thread == 0 and hands it to num_threads(), which
    // requires a positive value.
    if (n_samples == 0) {
        return;
    }
#ifdef _OPENMP
    size_t n_max_threads = omp_get_max_threads();
    // Give every thread at least 32 samples so small batches are not over-split.
    size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
    size_t n_thread = n_samples / n_samples_per_thread;
    // One extra thread picks up the remainder, if any.
    n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;

#pragma omp parallel for num_threads(n_thread) schedule(static)
    for (size_t i = 0; i < n_thread; ++i) {
        size_t start = i * n_samples_per_thread;
        size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
        size_t n_samples_this_thread = end - start;
        size_t offset_in = start * CONFIG_T::N_inp;
        size_t offset_out = start * CONFIG_T::N_out;
        _inference<CONFIG_T>(&c_inp[offset_in], &c_out[offset_out], n_samples_this_thread);
    }
#else
    _inference<CONFIG_T>(c_inp, c_out, n_samples);
#endif
}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
`timescale 1ns / 1ps
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
// Two-way selector with operand alignment.
// `out` is driven by `in0` when `key` is set, otherwise by `in1`
// (arithmetically negated when INVERT1 == 1). A positive SHIFT1 pads `in1`
// with SHIFT1 zero LSBs; a negative SHIFT1 pads `in0` with -SHIFT1 zero LSBs.
// Both operands are extended to BW_BUF bits (sign-extended when the matching
// SIGNEDx parameter is 1, zero-extended otherwise) and the low BW_OUT bits
// are forwarded.
module mux #(
    parameter BW_INPUT0 = 32,
    parameter BW_INPUT1 = 32,
    parameter SIGNED0 = 0,
    parameter SIGNED1 = 0,
    parameter BW_OUT = 32,
    parameter SHIFT1 = 0,
    parameter INVERT1 = 0
) (
    input key,
    input [BW_INPUT0-1:0] in0,
    input [BW_INPUT1-1:0] in1,
    output [BW_OUT-1:0] out
);

  // Zero bits appended below each operand.
  localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
  localparam IN1_PAD_RIGHT = (SHIFT1 > 0) ? SHIFT1 : 0;

  // Width each operand occupies after the right padding.
  localparam IN0_NEED_BITS = BW_INPUT0 + IN0_PAD_RIGHT;
  localparam IN1_NEED_BITS = BW_INPUT1 + IN1_PAD_RIGHT;
  localparam MAX_NEED_BITS = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS : IN1_NEED_BITS;

  // One extra bit for mixed signedness, one more when in1 is negated.
  localparam EXTRA_PAD = INVERT1 + ((SIGNED0 != SIGNED1) ? 1 : 0);
  localparam BW_BUF = MAX_NEED_BITS + EXTRA_PAD;

  // Extension bits prepended above each operand to reach BW_BUF.
  localparam IN0_PAD_LEFT = BW_BUF - IN0_NEED_BITS;
  localparam IN1_PAD_LEFT = BW_BUF - IN1_NEED_BITS;

  // verilator lint_off UNUSEDSIGNAL
  wire [BW_BUF-1:0] in0_ext;
  wire [BW_BUF-1:0] in1_ext;
  // verilator lint_on UNUSEDSIGNAL

  generate
    if (SIGNED0 == 1) begin : in0_is_signed
      assign in0_ext = {{IN0_PAD_LEFT{in0[BW_INPUT0-1]}}, in0, {IN0_PAD_RIGHT{1'b0}}};
    end else begin : in0_is_unsigned
      assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
    end

    if (SIGNED1 == 1) begin : in1_is_signed
      assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
    end else begin : in1_is_unsigned
      assign in1_ext = {{IN1_PAD_LEFT{1'b0}}, in1, {IN1_PAD_RIGHT{1'b0}}};
    end

    if (INVERT1 == 1) begin : is_invert
      assign out = (key) ? in0_ext[BW_OUT-1:0] : -in1_ext[BW_OUT-1:0];
    end else begin : is_not_invert
      assign out = (key) ? in0_ext[BW_OUT-1:0] : in1_ext[BW_OUT-1:0];
    end
  endgenerate

endmodule
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
`timescale 1ns / 1ps
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
// Arithmetic negation with width adaptation.
// When BW_IN < BW_OUT the input is first extended to BW_OUT bits
// (sign-extended if IN_SIGNED == 1, zero-extended otherwise) and then negated;
// otherwise the low BW_OUT bits of the input are negated directly.
module negative #(
    parameter BW_IN = 32,
    parameter BW_OUT = 32,
    parameter IN_SIGNED = 0
) (
    // Upper input bits are unused when BW_IN > BW_OUT, so waive the warning
    // for this port only and re-enable it right after.
    // verilator lint_off UNUSEDSIGNAL
    input [ BW_IN-1:0] in,
    // verilator lint_on UNUSEDSIGNAL
    output [BW_OUT-1:0] out
);
  generate
    if (BW_IN < BW_OUT) begin : in_is_smaller
      wire [BW_OUT-1:0] in_ext;
      if (IN_SIGNED == 1) begin : is_signed
        assign in_ext = {{BW_OUT - BW_IN{in[BW_IN-1]}}, in};
      end else begin : is_unsigned
        assign in_ext = {{BW_OUT - BW_IN{1'b0}}, in};
      end
      assign out = -in_ext;
    end else begin : in_is_bigger
      assign out = -in[BW_OUT-1:0];
    end
  endgenerate

endmodule
|
|
@@ -17,7 +17,7 @@ module shift_adder #(
|
|
|
17
17
|
|
|
18
18
|
localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
|
|
19
19
|
localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
|
|
20
|
-
localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB+1 : IS_SUB+0;
|
|
20
|
+
localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB + 1 : IS_SUB + 0;
|
|
21
21
|
localparam BW_ADD = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD + 1 : IN1_NEED_BITS + EXTRA_PAD + 1;
|
|
22
22
|
localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_ADD - BW_INPUT0 + SHIFT1 : BW_ADD - BW_INPUT0;
|
|
23
23
|
localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
|
|
@@ -37,6 +37,9 @@ module shift_adder #(
|
|
|
37
37
|
end else begin : in0_is_unsigned
|
|
38
38
|
assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
|
|
39
39
|
end
|
|
40
|
+
endgenerate
|
|
41
|
+
|
|
42
|
+
generate
|
|
40
43
|
if (SIGNED1 == 1) begin : in1_is_signed
|
|
41
44
|
assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
|
|
42
45
|
end else begin : in1_is_unsigned
|
|
@@ -27,3 +27,6 @@ set_clock_uncertainty -setup $uncertainty_setup [get_clocks sys_clk]
|
|
|
27
27
|
set_clock_uncertainty -hold $uncertainty_hold [get_clocks sys_clk]
|
|
28
28
|
|
|
29
29
|
set_property HD.CLK_SRC BUFG_X0Y0 [get_ports clk]
|
|
30
|
+
|
|
31
|
+
set_property retiming_forward 1 [get_cells {stage[*]_inp}]
|
|
32
|
+
set_property retiming_backward 1 [get_cells {stage[*]_inp}]
|
|
@@ -13,7 +13,7 @@ from numpy.typing import NDArray
|
|
|
13
13
|
from ... import codegen
|
|
14
14
|
from ...cmvm.types import CascadedSolution, Solution, _minimal_kif
|
|
15
15
|
from ...trace.pipeline import to_pipeline
|
|
16
|
-
from . import
|
|
16
|
+
from . import binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_logic_gen
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def get_io_kifs(sol: Solution | CascadedSolution):
|
|
@@ -59,6 +59,7 @@ class VerilogModel:
|
|
|
59
59
|
self._latency_cutoff = latency_cutoff
|
|
60
60
|
|
|
61
61
|
self._lib = None
|
|
62
|
+
self._uuid = None
|
|
62
63
|
|
|
63
64
|
def write(self):
|
|
64
65
|
self._path.mkdir(parents=True, exist_ok=True)
|
|
@@ -89,7 +90,7 @@ class VerilogModel:
|
|
|
89
90
|
f.write(xdc)
|
|
90
91
|
|
|
91
92
|
# C++ binder w/ verilog wrapper for uniform bw
|
|
92
|
-
binder =
|
|
93
|
+
binder = binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)
|
|
93
94
|
|
|
94
95
|
# Verilog IO wrapper (non-uniform bw to uniform one, clk passthrough)
|
|
95
96
|
io_wrapper = generate_io_wrapper(self._pipe, self._prj_name, True)
|
|
@@ -105,7 +106,7 @@ class VerilogModel:
|
|
|
105
106
|
|
|
106
107
|
# Verilog IO wrapper (non-uniform bw to uniform one, no clk)
|
|
107
108
|
io_wrapper = generate_io_wrapper(self._solution, self._prj_name, False)
|
|
108
|
-
binder =
|
|
109
|
+
binder = binder_gen(self._solution, f'{self._prj_name}_wrapper')
|
|
109
110
|
|
|
110
111
|
with open(self._path / f'{self._prj_name}_wrapper.v', 'w') as f:
|
|
111
112
|
f.write(io_wrapper)
|
|
@@ -114,13 +115,16 @@ class VerilogModel:
|
|
|
114
115
|
|
|
115
116
|
# Common resource copy
|
|
116
117
|
shutil.copy(self.__src_root / 'verilog/source/shift_adder.v', self._path)
|
|
118
|
+
shutil.copy(self.__src_root / 'verilog/source/mux.v', self._path)
|
|
119
|
+
shutil.copy(self.__src_root / 'verilog/source/negative.v', self._path)
|
|
117
120
|
shutil.copy(self.__src_root / 'verilog/source/build_binder.mk', self._path)
|
|
118
|
-
shutil.copy(self.__src_root / 'verilog/source/
|
|
121
|
+
shutil.copy(self.__src_root / 'verilog/source/ioutil.hh', self._path)
|
|
122
|
+
shutil.copy(self.__src_root / 'verilog/source/binder_util.hh', self._path)
|
|
119
123
|
self._solution.save(self._path / 'model.json')
|
|
120
124
|
with open(self._path / 'misc.json', 'w') as f:
|
|
121
125
|
f.write(f'{{"cost": {self._solution.cost}}}')
|
|
122
126
|
|
|
123
|
-
def _compile(self, verbose=False, openmp=True, o3: bool = False, clean=True):
|
|
127
|
+
def _compile(self, verbose=False, openmp=True, nproc=None, o3: bool = False, clean=True):
|
|
124
128
|
"""Same as compile, but will not write to the library
|
|
125
129
|
|
|
126
130
|
Parameters
|
|
@@ -129,6 +133,9 @@ class VerilogModel:
|
|
|
129
133
|
Verbose output, by default False
|
|
130
134
|
openmp : bool, optional
|
|
131
135
|
Enable openmp, by default True
|
|
136
|
+
nproc : int | None, optional
|
|
137
|
+
Number of processes to use for compilation, by default None
|
|
138
|
+
If None, will use the number of CPU cores, but not more than 32.
|
|
132
139
|
o3 : bool | None, optional
|
|
133
140
|
Turn on -O3 flag, by default False
|
|
134
141
|
clean : bool, optional
|
|
@@ -146,14 +153,20 @@ class VerilogModel:
|
|
|
146
153
|
env['VM_PREFIX'] = f'{self._prj_name}_wrapper'
|
|
147
154
|
env['STAMP'] = self._uuid
|
|
148
155
|
env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
|
|
156
|
+
if nproc is not None:
|
|
157
|
+
env['N_JOBS'] = str(nproc)
|
|
149
158
|
if o3:
|
|
150
159
|
args.append('fast')
|
|
151
160
|
|
|
152
|
-
if clean:
|
|
161
|
+
if clean is not False:
|
|
153
162
|
m = re.compile(r'^lib.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.so$')
|
|
154
163
|
for p in self._path.iterdir():
|
|
155
164
|
if not p.is_dir() and m.match(p.name):
|
|
156
165
|
p.unlink()
|
|
166
|
+
if clean:
|
|
167
|
+
subprocess.run(
|
|
168
|
+
['make', '-f', 'build_binder.mk', 'clean'], env=env, cwd=self._path, check=True, capture_output=not verbose
|
|
169
|
+
)
|
|
157
170
|
|
|
158
171
|
try:
|
|
159
172
|
r = subprocess.run(args, env=env, check=True, cwd=self._path, capture_output=not verbose)
|
|
@@ -170,13 +183,19 @@ class VerilogModel:
|
|
|
170
183
|
|
|
171
184
|
def _load_lib(self, uuid: str | None = None):
|
|
172
185
|
uuid = uuid if uuid is not None else self._uuid
|
|
186
|
+
if uuid is None:
|
|
187
|
+
# load .so if there is only one, otherwise raise an error
|
|
188
|
+
libs = list(self._path.glob(f'lib{self._prj_name}_wrapper_*.so'))
|
|
189
|
+
if len(libs) == 0:
|
|
190
|
+
raise RuntimeError(f'Cannot load library, found {len(libs)} libraries in {self._path}')
|
|
191
|
+
uuid = libs[0].name.split('_')[-1].split('.', 1)[0]
|
|
173
192
|
self._uuid = uuid
|
|
174
193
|
lib_path = self._path / f'lib{self._prj_name}_wrapper_{uuid}.so'
|
|
175
194
|
if not lib_path.exists():
|
|
176
195
|
raise RuntimeError(f'Library {lib_path} does not exist')
|
|
177
196
|
self._lib = ctypes.CDLL(str(lib_path))
|
|
178
197
|
|
|
179
|
-
def compile(self, verbose=False, openmp=True, o3: bool = False):
|
|
198
|
+
def compile(self, verbose=False, openmp=True, nproc: int | None = None, o3: bool = False, clean=True):
|
|
180
199
|
"""Compile the generated code to a emulator for logic simulation.
|
|
181
200
|
|
|
182
201
|
Parameters
|
|
@@ -185,8 +204,13 @@ class VerilogModel:
|
|
|
185
204
|
Verbose output, by default False
|
|
186
205
|
openmp : bool, optional
|
|
187
206
|
Enable openmp, by default True
|
|
207
|
+
nproc : int | None, optional
|
|
208
|
+
Number of processes to use for compilation, by default None
|
|
209
|
+
If None, will use the number of CPU cores, but not more than 32.
|
|
188
210
|
o3 : bool | None, optional
|
|
189
211
|
Turn on -O3 flag, by default False
|
|
212
|
+
clean : bool, optional
|
|
213
|
+
Remove obsolete shared object files, by default True
|
|
190
214
|
|
|
191
215
|
Raises
|
|
192
216
|
------
|
|
@@ -194,8 +218,7 @@ class VerilogModel:
|
|
|
194
218
|
If compilation fails
|
|
195
219
|
"""
|
|
196
220
|
self.write()
|
|
197
|
-
self._compile(verbose=verbose, openmp=openmp, o3=o3)
|
|
198
|
-
self._load_lib()
|
|
221
|
+
self._compile(verbose=verbose, openmp=openmp, nproc=nproc, o3=o3, clean=clean)
|
|
199
222
|
|
|
200
223
|
def predict(self, data: NDArray[np.floating]):
|
|
201
224
|
"""Run the model on the input data.
|
|
@@ -227,7 +250,7 @@ class VerilogModel:
|
|
|
227
250
|
out_data = np.empty(n_sample * out_size, dtype=np.int32)
|
|
228
251
|
|
|
229
252
|
# Convert to int32 matching the LSB position
|
|
230
|
-
inp_data[:] = data.ravel() * 2.0
|
|
253
|
+
inp_data[:] = np.floor(data.ravel() * 2.0**f_in)
|
|
231
254
|
|
|
232
255
|
inp_buf = inp_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
|
|
233
256
|
out_buf = out_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
|
|
@@ -235,7 +258,7 @@ class VerilogModel:
|
|
|
235
258
|
|
|
236
259
|
# Unscale the output int32 to recover fp values
|
|
237
260
|
k, i, f = np.max(k_out), np.max(i_out), np.max(f_out)
|
|
238
|
-
a, b, c = 2.0 ** (k + i + f), 2.0 ** (i + f), 2.0**-f
|
|
261
|
+
a, b, c = 2.0 ** (k + i + f), k * 2.0 ** (i + f), 2.0**-f
|
|
239
262
|
return ((out_data.reshape(n_sample, out_size) + b) % a - b) * c
|
|
240
263
|
|
|
241
264
|
def __repr__(self):
|
|
@@ -261,7 +284,8 @@ Estimated cost: {cost} LUTs"""
|
|
|
261
284
|
|
|
262
285
|
is_compiled = self._lib is not None
|
|
263
286
|
if is_compiled:
|
|
264
|
-
|
|
287
|
+
assert self._uuid is not None
|
|
288
|
+
openmp = 'with OpenMP' if self._lib.openmp_enabled() else '' # type: ignore
|
|
265
289
|
spec += f'\nEmulator is compiled {openmp} ({self._uuid[-12:]})'
|
|
266
290
|
else:
|
|
267
291
|
spec += '\nEmulator is **not compiled**'
|
|
File without changes
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from collections.abc import Sequence
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import keras
|
|
6
|
+
from keras import KerasTensor, Operation
|
|
7
|
+
|
|
8
|
+
from ...trace import FixedVariableArray, HWConfig
|
|
9
|
+
from ...trace.fixed_variable_array import FixedVariableArrayInput
|
|
10
|
+
from .replica import _registry
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class OpObj:
    """Record of a single keras graph node, as collected by ``parse_model``."""

    # The keras operation held by the node.
    operation: Operation
    # Positional and keyword arguments recorded for the node's call.
    args: list
    kwargs: dict
    # Tensors output by the node.
    produces: tuple[KerasTensor, ...]
    # Keras tensors appearing in the node's arguments.
    requires: tuple[KerasTensor, ...]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def parse_model(model: keras.Model):
    """Collect the operations of a keras model, grouped by graph depth.

    Parameters
    ----------
    model : keras.Model
        The model whose ``_nodes_by_depth`` mapping is read.

    Returns
    -------
    list of list of OpObj
        One list of operations per depth, ordered from the largest depth
        key down to depth 0.
    """
    by_depth: dict[int, list[OpObj]] = {}
    for depth, nodes in model._nodes_by_depth.items():
        level_ops = []
        for node in nodes:
            assert isinstance(node.operation, keras.Operation)
            level_ops.append(
                OpObj(
                    operation=node.operation,
                    args=node.arguments.args,
                    kwargs=node.arguments.kwargs,
                    produces=node.outputs,
                    requires=node.arguments.keras_tensors,
                )
            )
        by_depth[depth] = level_ops

    deepest = max(by_depth)
    return [by_depth[depth] for depth in reversed(range(deepest + 1))]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def replace_tensors(tensor_map: dict[KerasTensor, FixedVariableArray], obj: Any) -> Any:
    """Recursively substitute keras tensors in ``obj`` using ``tensor_map``.

    Lists, tuples and dict values are traversed and rebuilt; a ``KerasTensor``
    is replaced by ``tensor_map[obj]``; anything else is returned unchanged.
    """

    def convert(item: Any) -> Any:
        return replace_tensors(tensor_map, item)

    if isinstance(obj, KerasTensor):
        return tensor_map[obj]
    if isinstance(obj, list):
        return list(map(convert, obj))
    if isinstance(obj, tuple):
        return tuple(map(convert, obj))
    if isinstance(obj, dict):
        return {key: convert(value) for key, value in obj.items()}
    return obj
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _apply_nn(
    model: keras.Model, inputs: FixedVariableArray | Sequence[FixedVariableArray], verbose: bool = False
) -> tuple[FixedVariableArray, ...]:
    """
    Replay a keras model on fixed variable arrays, operation by operation.

    Parameters
    ----------
    model : keras.Model
        The keras model to apply.
    inputs : FixedVariableArray or Sequence[FixedVariableArray]
        One array per model input; a single array is wrapped into a 1-tuple.
    verbose : bool, optional
        Print each operation before it is applied, by default False

    Returns
    -------
    tuple of FixedVariableArray
        The arrays corresponding to ``model.outputs``, in that order.
    """
    if isinstance(inputs, FixedVariableArray):
        inputs = (inputs,)

    assert len(model.inputs) == len(inputs), f'Model has {len(model.inputs)} inputs, got {len(inputs)}'

    # Maps every keras tensor seen so far to the array that stands in for it.
    tensor_map = dict(zip(model.inputs, inputs))

    for op in (item for level in parse_model(model) for item in level):
        assert all(t in tensor_map for t in op.requires)
        args = replace_tensors(tensor_map, op.args)
        kwargs: dict[str, Any] = replace_tensors(tensor_map, op.kwargs)

        op_cls = op.operation.__class__
        if op_cls is keras.layers.InputLayer:
            # Model inputs are already present in tensor_map.
            continue

        mirror_op = _registry[op_cls](op.operation)
        if verbose:
            print(f'Processing operation {op.operation.name} ({op.operation.__class__.__name__})')
        outputs = mirror_op(*args, **kwargs)
        tensor_map.update(zip(op.produces, outputs))

    return tuple(tensor_map[out_tensor] for out_tensor in model.outputs)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def trace_model(
    model: keras.Model,
    hwconf: HWConfig = HWConfig(1, -1, -1),
    solver_options: dict[str, Any] | None = None,
    verbose: bool = False,
    inputs: tuple[FixedVariableArray, ...] | None = None,
) -> tuple[tuple[FixedVariableArray, ...], tuple[FixedVariableArray, ...]]:
    """Trace a keras model and return its input and output arrays.

    Parameters
    ----------
    model : keras.Model
        The keras model to trace.
    hwconf : HWConfig, optional
        Passed to each generated input array, by default HWConfig(1, -1, -1)
    solver_options : dict[str, Any] | None, optional
        Passed to each generated input array, by default None
    verbose : bool, optional
        Forwarded to ``_apply_nn``, by default False
    inputs : tuple[FixedVariableArray, ...] | None, optional
        Input arrays to use. If None, one ``FixedVariableArrayInput`` is
        created per model input.

    Returns
    -------
    tuple
        ``(inputs, outputs)``, each a tuple of FixedVariableArray.
    """

    def make_input(keras_inp):
        # shape[1:] drops the leading axis (presumably the batch axis -- confirm).
        return FixedVariableArrayInput(keras_inp.shape[1:], hwconf=hwconf, solver_options=solver_options)

    if inputs is None:
        inputs = tuple(make_input(inp) for inp in model.inputs)

    return inputs, _apply_nn(model, inputs, verbose=verbose)
|