da4ml 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of da4ml might be problematic. Click here for more details.
- da4ml/_version.py +2 -2
- da4ml/cmvm/api.py +2 -6
- da4ml/cmvm/core/__init__.py +0 -1
- da4ml/cmvm/types.py +99 -19
- da4ml/codegen/__init__.py +5 -4
- da4ml/codegen/cpp/__init__.py +2 -1
- da4ml/codegen/cpp/cpp_codegen.py +58 -25
- da4ml/codegen/cpp/hls_model.py +252 -0
- da4ml/codegen/cpp/source/ap_types/ap_binary.h +78 -0
- da4ml/codegen/cpp/source/ap_types/ap_common.h +376 -0
- da4ml/codegen/cpp/source/ap_types/ap_decl.h +212 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed.h +360 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed_base.h +2354 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed_ref.h +718 -0
- da4ml/codegen/cpp/source/ap_types/ap_fixed_special.h +230 -0
- da4ml/codegen/cpp/source/ap_types/ap_int.h +330 -0
- da4ml/codegen/cpp/source/ap_types/ap_int_base.h +1885 -0
- da4ml/codegen/cpp/source/ap_types/ap_int_ref.h +1346 -0
- da4ml/codegen/cpp/source/ap_types/ap_int_special.h +223 -0
- da4ml/codegen/cpp/source/ap_types/ap_shift_reg.h +138 -0
- da4ml/codegen/cpp/source/ap_types/etc/ap_private.h +7199 -0
- da4ml/codegen/cpp/source/ap_types/hls_math.h +27 -0
- da4ml/codegen/cpp/source/ap_types/hls_stream.h +263 -0
- da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h +80 -0
- da4ml/codegen/cpp/source/binder_util.hh +56 -0
- da4ml/codegen/cpp/source/build_binder.mk +24 -0
- da4ml/codegen/cpp/source/{vitis.h → vitis_bitshift.hh} +1 -1
- da4ml/codegen/verilog/__init__.py +2 -3
- da4ml/codegen/verilog/comb.py +65 -24
- da4ml/codegen/verilog/io_wrapper.py +36 -141
- da4ml/codegen/verilog/pipeline.py +21 -3
- da4ml/codegen/verilog/source/binder_util.hh +72 -0
- da4ml/codegen/verilog/source/build_prj.tcl +0 -1
- da4ml/codegen/verilog/source/mux.v +58 -0
- da4ml/codegen/verilog/source/negative.v +28 -0
- da4ml/codegen/verilog/source/shift_adder.v +4 -1
- da4ml/codegen/verilog/source/template.xdc +3 -0
- da4ml/codegen/verilog/verilog_model.py +42 -15
- da4ml/converter/__init__.py +0 -0
- da4ml/converter/hgq2/parser.py +105 -0
- da4ml/converter/hgq2/replica.py +383 -0
- da4ml/trace/__init__.py +2 -2
- da4ml/trace/fixed_variable.py +177 -18
- da4ml/trace/fixed_variable_array.py +124 -9
- da4ml/trace/ops/__init__.py +22 -6
- da4ml/trace/ops/conv_utils.py +146 -14
- da4ml/trace/ops/einsum_utils.py +9 -6
- da4ml/trace/ops/reduce_utils.py +103 -0
- da4ml/trace/pipeline.py +36 -34
- da4ml/trace/tracer.py +37 -5
- da4ml-0.3.0.dist-info/METADATA +107 -0
- da4ml-0.3.0.dist-info/RECORD +64 -0
- da4ml/codegen/cpp/source/vitis_bridge.h +0 -17
- da4ml-0.2.0.dist-info/METADATA +0 -65
- da4ml-0.2.0.dist-info/RECORD +0 -39
- /da4ml/codegen/verilog/source/{ioutils.hh → ioutil.hh} +0 -0
- {da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/WHEEL +0 -0
- {da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -7,7 +7,7 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
|
|
|
7
7
|
N = len(qints)
|
|
8
8
|
ks, _is, fs = zip(*map(_minimal_kif, qints))
|
|
9
9
|
Is = [_i + _k for _i, _k in zip(_is, ks)]
|
|
10
|
-
max_I, max_f = max(Is), max(fs)
|
|
10
|
+
max_I, max_f = max(_is) + max(ks), max(fs)
|
|
11
11
|
max_bw = max_I + max_f
|
|
12
12
|
width_regular, width_packed = max_bw * N, sum(Is) + sum(fs)
|
|
13
13
|
|
|
@@ -32,11 +32,16 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
|
|
|
32
32
|
copy_from = hetero[i][0] if ks[i] else -1
|
|
33
33
|
pads.append((base + max_bw - 1, base + max_bw - bias_high, copy_from))
|
|
34
34
|
|
|
35
|
+
mask = list(high < low for high, low in hetero)
|
|
36
|
+
regular = [r for r, m in zip(regular, mask) if not m]
|
|
37
|
+
hetero = [h for h, m in zip(hetero, mask) if not m]
|
|
38
|
+
|
|
35
39
|
if not merge:
|
|
36
40
|
return regular, hetero, pads, (width_regular, width_packed)
|
|
37
41
|
|
|
38
42
|
# Merging consecutive intervals when possible
|
|
39
|
-
|
|
43
|
+
NN = len(regular) - 2
|
|
44
|
+
for i in range(NN, -1, -1):
|
|
40
45
|
this_high = regular[i][0]
|
|
41
46
|
next_low = regular[i + 1][1]
|
|
42
47
|
if next_low - this_high != 1:
|
|
@@ -65,6 +70,8 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe
|
|
|
65
70
|
_out_assignment: list[tuple[int, str]] = []
|
|
66
71
|
|
|
67
72
|
for i, ((ih, jh), (ir, jr)) in enumerate(zip(het_out, reg_out)):
|
|
73
|
+
if ih == jh - 1:
|
|
74
|
+
continue
|
|
68
75
|
_out_assignment.append((ih, f'assign out[{ir}:{jr}] = packed_out[{ih}:{jh}];'))
|
|
69
76
|
|
|
70
77
|
for i, (i, j, copy_from) in enumerate(pad_out):
|
|
@@ -86,12 +93,12 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe
|
|
|
86
93
|
|
|
87
94
|
module {module_name}_wrapper ({clk_and_rst_inp}
|
|
88
95
|
// verilator lint_off UNUSEDSIGNAL
|
|
89
|
-
input [{w_reg_in-1}:0] inp,
|
|
96
|
+
input [{w_reg_in - 1}:0] inp,
|
|
90
97
|
// verilator lint_on UNUSEDSIGNAL
|
|
91
|
-
output [{w_reg_out-1}:0] out
|
|
98
|
+
output [{w_reg_out - 1}:0] out
|
|
92
99
|
);
|
|
93
|
-
wire [{w_het_in-1}:0] packed_inp;
|
|
94
|
-
wire [{w_het_out-1}:0] packed_out;
|
|
100
|
+
wire [{w_het_in - 1}:0] packed_inp;
|
|
101
|
+
wire [{w_het_out - 1}:0] packed_out;
|
|
95
102
|
|
|
96
103
|
{inp_assignment_str}
|
|
97
104
|
|
|
@@ -106,150 +113,38 @@ endmodule
|
|
|
106
113
|
"""
|
|
107
114
|
|
|
108
115
|
|
|
109
|
-
def
|
|
110
|
-
k_in, i_in, f_in = zip(*map(_minimal_kif, sol.inp_qint))
|
|
111
|
-
k_out, i_out, f_out = zip(*map(_minimal_kif, sol.out_qint))
|
|
112
|
-
max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
|
|
113
|
-
max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
|
|
114
|
-
|
|
115
|
-
n_in, n_out = sol.shape
|
|
116
|
-
return f"""#include "V{module_name}.h"
|
|
117
|
-
#include "ioutils.hh"
|
|
118
|
-
#include <verilated.h>
|
|
119
|
-
|
|
120
|
-
#ifdef _OPENMP
|
|
121
|
-
#include <omp.h>
|
|
122
|
-
constexpr bool _openmp = true;
|
|
123
|
-
#else
|
|
124
|
-
constexpr bool _openmp = false;
|
|
125
|
-
#endif
|
|
126
|
-
|
|
127
|
-
constexpr size_t N_inp = {n_in};
|
|
128
|
-
constexpr size_t N_out = {n_out};
|
|
129
|
-
constexpr size_t max_inp_bw = {max_inp_bw};
|
|
130
|
-
constexpr size_t max_out_bw = {max_out_bw};
|
|
131
|
-
typedef V{module_name} dut_t;
|
|
132
|
-
|
|
133
|
-
extern "C" {{
|
|
134
|
-
|
|
135
|
-
bool openmp_enabled() {{
|
|
136
|
-
return _openmp;
|
|
137
|
-
}}
|
|
138
|
-
|
|
139
|
-
void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
|
|
140
|
-
dut_t *dut = new dut_t;
|
|
141
|
-
|
|
142
|
-
for (size_t i = 0; i < n_samples; ++i) {{
|
|
143
|
-
write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[i * N_inp]);
|
|
144
|
-
dut->eval();
|
|
145
|
-
read_output<N_out, max_out_bw>(dut->out, &c_out[i * N_out]);
|
|
146
|
-
}}
|
|
147
|
-
|
|
148
|
-
dut->final();
|
|
149
|
-
delete dut;
|
|
150
|
-
}}
|
|
151
|
-
|
|
152
|
-
void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
|
|
153
|
-
size_t n_max_threads = omp_get_max_threads();
|
|
154
|
-
size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
|
|
155
|
-
size_t n_thread = n_samples / n_samples_per_thread;
|
|
156
|
-
n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
|
|
157
|
-
|
|
158
|
-
#ifdef _OPENMP
|
|
159
|
-
#pragma omp parallel for num_threads(n_thread) schedule(static)
|
|
160
|
-
for (size_t i = 0; i < n_thread; ++i) {{
|
|
161
|
-
size_t start = i * n_samples_per_thread;
|
|
162
|
-
size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
|
|
163
|
-
size_t n_samples_this_thread = end - start;
|
|
164
|
-
|
|
165
|
-
_inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
|
|
166
|
-
}}
|
|
167
|
-
#else
|
|
168
|
-
_inference(c_inp, c_out, n_samples);
|
|
169
|
-
#endif
|
|
170
|
-
}}
|
|
171
|
-
}}"""
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
def pipeline_binder_gen(csol: CascadedSolution, module_name: str, II: int = 1):
|
|
116
|
+
def binder_gen(csol: CascadedSolution | Solution, module_name: str, II: int = 1, latency_multiplier: int = 1):
|
|
175
117
|
k_in, i_in, f_in = zip(*map(_minimal_kif, csol.inp_qint))
|
|
176
118
|
k_out, i_out, f_out = zip(*map(_minimal_kif, csol.out_qint))
|
|
177
|
-
max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
|
|
178
|
-
max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
|
|
179
|
-
|
|
180
|
-
|
|
119
|
+
max_inp_bw = max(k_in) + max(i_in) + max(f_in)
|
|
120
|
+
max_out_bw = max(k_out) + max(i_out) + max(f_out)
|
|
121
|
+
if isinstance(csol, Solution):
|
|
122
|
+
II = latency = 0
|
|
123
|
+
else:
|
|
124
|
+
latency = len(csol.solutions) * latency_multiplier
|
|
181
125
|
|
|
182
126
|
n_in, n_out = csol.shape
|
|
183
|
-
return f"""#include "V{module_name}.h"
|
|
184
|
-
#include "ioutils.hh"
|
|
185
|
-
#include <verilated.h>
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
constexpr size_t max_inp_bw = {max_inp_bw};
|
|
197
|
-
constexpr size_t max_out_bw = {max_out_bw};
|
|
198
|
-
constexpr size_t II = {II};
|
|
199
|
-
constexpr size_t latency = {n_stage};
|
|
200
|
-
typedef V{module_name} dut_t;
|
|
127
|
+
return f"""#include <cstddef>
|
|
128
|
+
#include "binder_util.hh"
|
|
129
|
+
#include "V{module_name}.h"
|
|
130
|
+
|
|
131
|
+
struct {module_name}_config {{
|
|
132
|
+
static const size_t N_inp = {n_in};
|
|
133
|
+
static const size_t N_out = {n_out};
|
|
134
|
+
static const size_t max_inp_bw = {max_inp_bw};
|
|
135
|
+
static const size_t max_out_bw = {max_out_bw};
|
|
136
|
+
static const size_t II = {II};
|
|
137
|
+
static const size_t latency = {latency};
|
|
138
|
+
typedef V{module_name} dut_t;
|
|
139
|
+
}};
|
|
201
140
|
|
|
202
141
|
extern "C" {{
|
|
203
|
-
|
|
204
142
|
bool openmp_enabled() {{
|
|
205
143
|
return _openmp;
|
|
206
144
|
}}
|
|
207
145
|
|
|
208
|
-
void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
|
|
209
|
-
dut_t *dut = new dut_t;
|
|
210
|
-
|
|
211
|
-
size_t clk_req = n_samples * II + latency + 1;
|
|
212
|
-
|
|
213
|
-
for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {{
|
|
214
|
-
size_t t_out = t_inp - latency - 1;
|
|
215
|
-
|
|
216
|
-
if (t_inp < n_samples * II && t_inp % II == 0) {{
|
|
217
|
-
write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[t_inp / II * N_inp]);
|
|
218
|
-
}}
|
|
219
|
-
|
|
220
|
-
dut->clk = 0;
|
|
221
|
-
dut->eval();
|
|
222
|
-
|
|
223
|
-
if (t_inp > latency && t_out % II == 0) {{
|
|
224
|
-
read_output<N_out, max_out_bw>(dut->out, &c_out[t_out / II * N_out]);
|
|
225
|
-
}}
|
|
226
|
-
|
|
227
|
-
dut->clk = 1;
|
|
228
|
-
dut->eval();
|
|
229
|
-
}}
|
|
230
|
-
|
|
231
|
-
dut->final();
|
|
232
|
-
delete dut;
|
|
233
|
-
}}
|
|
234
|
-
|
|
235
146
|
void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
|
|
236
|
-
|
|
237
|
-
size_t n_max_threads = omp_get_max_threads();
|
|
238
|
-
size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
|
|
239
|
-
size_t n_thread = n_samples / n_samples_per_thread;
|
|
240
|
-
n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
|
|
241
|
-
|
|
242
|
-
#pragma omp parallel for num_threads(n_thread) schedule(static)
|
|
243
|
-
for (size_t i = 0; i < n_thread; ++i) {{
|
|
244
|
-
size_t start = i * n_samples_per_thread;
|
|
245
|
-
size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
|
|
246
|
-
size_t n_samples_this_thread = end - start;
|
|
247
|
-
|
|
248
|
-
_inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
|
|
249
|
-
}}
|
|
250
|
-
#else
|
|
251
|
-
_inference(c_inp, c_out, n_samples);
|
|
252
|
-
#endif
|
|
147
|
+
batch_inference<{module_name}_config>(c_inp, c_out, n_samples);
|
|
253
148
|
}}
|
|
254
|
-
|
|
255
|
-
|
|
149
|
+
}}
|
|
150
|
+
"""
|
|
@@ -3,19 +3,37 @@ from .comb import comb_logic_gen
|
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
def pipeline_logic_gen(
|
|
6
|
-
csol: CascadedSolution,
|
|
6
|
+
csol: CascadedSolution,
|
|
7
|
+
name: str,
|
|
8
|
+
print_latency=False,
|
|
9
|
+
timescale: str | None = '`timescale 1 ns / 1 ps',
|
|
10
|
+
register_layers: int = 1,
|
|
7
11
|
):
|
|
8
12
|
N = len(csol.solutions)
|
|
9
13
|
inp_bits = [sum(map(sum, map(_minimal_kif, sol.inp_qint))) for sol in csol.solutions]
|
|
10
14
|
out_bits = inp_bits[1:] + [sum(map(sum, map(_minimal_kif, csol.out_qint)))]
|
|
11
15
|
|
|
12
16
|
registers = [f'reg [{width}-1:0] stage{i}_inp;' for i, width in enumerate(inp_bits)]
|
|
17
|
+
for i in range(0, register_layers - 1):
|
|
18
|
+
registers += [f'reg [{width}-1:0] stage{j}_inp_copy{i};' for j, width in enumerate(inp_bits)]
|
|
13
19
|
wires = [f'wire [{width}-1:0] stage{i}_out;' for i, width in enumerate(out_bits)]
|
|
14
20
|
|
|
15
21
|
comb_logic = [f'{name}_stage{i} stage{i} (.inp(stage{i}_inp), .out(stage{i}_out));' for i in range(N)]
|
|
16
22
|
|
|
17
|
-
|
|
18
|
-
|
|
23
|
+
if register_layers == 1:
|
|
24
|
+
serial_logic = ['stage0_inp <= inp;']
|
|
25
|
+
serial_logic += [f'stage{i}_inp <= stage{i-1}_out;' for i in range(1, N)]
|
|
26
|
+
else:
|
|
27
|
+
serial_logic = ['stage0_inp_copy0 <= inp;']
|
|
28
|
+
for j in range(1, register_layers - 1):
|
|
29
|
+
serial_logic.append(f'stage0_inp_copy{j} <= stage0_inp_copy{j-1};')
|
|
30
|
+
serial_logic.append(f'stage0_inp <= stage0_inp_copy{register_layers - 2};')
|
|
31
|
+
for i in range(1, N):
|
|
32
|
+
serial_logic.append(f'stage{i}_inp_copy0 <= stage{i-1}_out;')
|
|
33
|
+
for j in range(1, register_layers - 1):
|
|
34
|
+
serial_logic.append(f'stage{i}_inp_copy{j} <= stage{i}_inp_copy{j-1};')
|
|
35
|
+
serial_logic.append(f'stage{i}_inp <= stage{i}_inp_copy{register_layers - 2};')
|
|
36
|
+
|
|
19
37
|
serial_logic += [f'out <= stage{N-1}_out;']
|
|
20
38
|
|
|
21
39
|
sep0 = '\n '
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
#include "ioutil.hh"
|
|
2
|
+
#include <verilated.h>
|
|
3
|
+
|
|
4
|
+
#ifdef _OPENMP
|
|
5
|
+
#include <omp.h>
|
|
6
|
+
constexpr bool _openmp = true;
|
|
7
|
+
#else
|
|
8
|
+
constexpr bool _openmp = false;
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
template <typename CONFIG_T>
|
|
12
|
+
std::enable_if_t<CONFIG_T::II != 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
|
|
13
|
+
typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
|
|
14
|
+
|
|
15
|
+
size_t clk_req = n_samples * CONFIG_T::II + CONFIG_T::latency + 1;
|
|
16
|
+
|
|
17
|
+
for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {
|
|
18
|
+
size_t t_out = t_inp - CONFIG_T::latency - 1;
|
|
19
|
+
|
|
20
|
+
if (t_inp < n_samples * CONFIG_T::II && t_inp % CONFIG_T::II == 0) {
|
|
21
|
+
write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[t_inp / CONFIG_T::II * CONFIG_T::N_inp]);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
dut->clk = 0;
|
|
25
|
+
dut->eval();
|
|
26
|
+
|
|
27
|
+
if (t_inp > CONFIG_T::latency && t_out % CONFIG_T::II == 0) {
|
|
28
|
+
read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[t_out / CONFIG_T::II * CONFIG_T::N_out]);
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
dut->clk = 1;
|
|
32
|
+
dut->eval();
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
dut->final();
|
|
36
|
+
delete dut;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
template <typename CONFIG_T>
|
|
40
|
+
std::enable_if_t<CONFIG_T::II == 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
|
|
41
|
+
typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
|
|
42
|
+
|
|
43
|
+
for (size_t i = 0; i < n_samples; ++i) {
|
|
44
|
+
write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[i * CONFIG_T::N_inp]);
|
|
45
|
+
dut->eval();
|
|
46
|
+
read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[i * CONFIG_T::N_out]);
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
dut->final();
|
|
50
|
+
delete dut;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
template <typename CONFIG_T> void batch_inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
|
|
54
|
+
#ifdef _OPENMP
|
|
55
|
+
size_t n_max_threads = omp_get_max_threads();
|
|
56
|
+
size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
|
|
57
|
+
size_t n_thread = n_samples / n_samples_per_thread;
|
|
58
|
+
n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
|
|
59
|
+
|
|
60
|
+
#pragma omp parallel for num_threads(n_thread) schedule(static)
|
|
61
|
+
for (size_t i = 0; i < n_thread; ++i) {
|
|
62
|
+
size_t start = i * n_samples_per_thread;
|
|
63
|
+
size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
|
|
64
|
+
size_t n_samples_this_thread = end - start;
|
|
65
|
+
size_t offset_in = start * CONFIG_T::N_inp;
|
|
66
|
+
size_t offset_out = start * CONFIG_T::N_out;
|
|
67
|
+
_inference<CONFIG_T>(&c_inp[offset_in], &c_out[offset_out], n_samples_this_thread);
|
|
68
|
+
}
|
|
69
|
+
#else
|
|
70
|
+
_inference<CONFIG_T>(c_inp, c_out, n_samples);
|
|
71
|
+
#endif
|
|
72
|
+
}
|
|
@@ -26,7 +26,6 @@ file mkdir "${output_dir}/reports"
|
|
|
26
26
|
# synth
|
|
27
27
|
synth_design -top $top_module -mode out_of_context -retiming \
|
|
28
28
|
-flatten_hierarchy rebuilt -resource_sharing auto \
|
|
29
|
-
-keep_equivalent_registers -shreg_min_size 8 \
|
|
30
29
|
-directive AlternateRoutability
|
|
31
30
|
|
|
32
31
|
write_checkpoint -force "${output_dir}/${project_name}_post_synth.dcp"
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
`timescale 1ns / 1ps
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
module mux #(
|
|
5
|
+
parameter BW_INPUT0 = 32,
|
|
6
|
+
parameter BW_INPUT1 = 32,
|
|
7
|
+
parameter SIGNED0 = 0,
|
|
8
|
+
parameter SIGNED1 = 0,
|
|
9
|
+
parameter BW_OUT = 32,
|
|
10
|
+
parameter SHIFT1 = 0,
|
|
11
|
+
parameter INVERT1 = 0
|
|
12
|
+
) (
|
|
13
|
+
input key,
|
|
14
|
+
input [BW_INPUT0-1:0] in0,
|
|
15
|
+
input [BW_INPUT1-1:0] in1,
|
|
16
|
+
output [BW_OUT-1:0] out
|
|
17
|
+
);
|
|
18
|
+
|
|
19
|
+
localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
|
|
20
|
+
localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
|
|
21
|
+
localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? INVERT1 + 1 : INVERT1 + 0;
|
|
22
|
+
localparam BW_BUF = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD : IN1_NEED_BITS + EXTRA_PAD;
|
|
23
|
+
localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_BUF - BW_INPUT0 + SHIFT1 : BW_BUF - BW_INPUT0;
|
|
24
|
+
localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
|
|
25
|
+
localparam IN1_PAD_LEFT = (SHIFT1 > 0) ? BW_BUF - BW_INPUT1 - SHIFT1 : BW_BUF - BW_INPUT1;
|
|
26
|
+
localparam IN1_PAD_RIGHT = (SHIFT1 > 0) ? SHIFT1 : 0;
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
// verilator lint_off UNUSEDSIGNAL
|
|
30
|
+
wire [BW_BUF-1:0] in0_ext;
|
|
31
|
+
wire [BW_BUF-1:0] in1_ext;
|
|
32
|
+
// verilator lint_on UNUSEDSIGNAL
|
|
33
|
+
|
|
34
|
+
generate
|
|
35
|
+
if (SIGNED0 == 1) begin : in0_is_signed
|
|
36
|
+
assign in0_ext = {{IN0_PAD_LEFT{in0[BW_INPUT0-1]}}, in0, {IN0_PAD_RIGHT{1'b0}}};
|
|
37
|
+
end else begin : in0_is_unsigned
|
|
38
|
+
assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
|
|
39
|
+
end
|
|
40
|
+
endgenerate
|
|
41
|
+
|
|
42
|
+
generate
|
|
43
|
+
if (SIGNED1 == 1) begin : in1_is_signed
|
|
44
|
+
assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
|
|
45
|
+
end else begin : in1_is_unsigned
|
|
46
|
+
assign in1_ext = {{IN1_PAD_LEFT{1'b0}}, in1, {IN1_PAD_RIGHT{1'b0}}};
|
|
47
|
+
end
|
|
48
|
+
endgenerate
|
|
49
|
+
|
|
50
|
+
generate
|
|
51
|
+
if (INVERT1 == 1) begin : is_invert
|
|
52
|
+
assign out = (key) ? in0_ext[BW_OUT-1:0] : -in1_ext[BW_OUT-1:0];
|
|
53
|
+
end else begin : is_not_invert
|
|
54
|
+
assign out = (key) ? in0_ext[BW_OUT-1:0] : in1_ext[BW_OUT-1:0];
|
|
55
|
+
end
|
|
56
|
+
endgenerate
|
|
57
|
+
|
|
58
|
+
endmodule
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
`timescale 1ns / 1ps
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
module negative #(
|
|
5
|
+
parameter BW_IN = 32,
|
|
6
|
+
parameter BW_OUT = 32,
|
|
7
|
+
parameter IN_SIGNED = 0
|
|
8
|
+
) (
|
|
9
|
+
// verilator lint_off UNUSEDSIGNAL
|
|
10
|
+
input [ BW_IN-1:0] in,
|
|
11
|
+
// verilator lint_off UNUSEDSIGNAL
|
|
12
|
+
output [BW_OUT-1:0] out
|
|
13
|
+
);
|
|
14
|
+
generate
|
|
15
|
+
if (BW_IN < BW_OUT) begin : in_is_smaller
|
|
16
|
+
wire [BW_OUT-1:0] in_ext;
|
|
17
|
+
if (IN_SIGNED == 1) begin : is_signed
|
|
18
|
+
assign in_ext = {{BW_OUT - BW_IN{in[BW_IN-1]}}, in};
|
|
19
|
+
end else begin : is_unsigned
|
|
20
|
+
assign in_ext = {{BW_OUT - BW_IN{1'b0}}, in};
|
|
21
|
+
end
|
|
22
|
+
assign out = -in_ext;
|
|
23
|
+
end else begin : in_is_bigger
|
|
24
|
+
assign out = -in[BW_OUT-1:0];
|
|
25
|
+
end
|
|
26
|
+
endgenerate
|
|
27
|
+
|
|
28
|
+
endmodule
|
|
@@ -17,7 +17,7 @@ module shift_adder #(
|
|
|
17
17
|
|
|
18
18
|
localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
|
|
19
19
|
localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
|
|
20
|
-
localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB+1 : IS_SUB+0;
|
|
20
|
+
localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB + 1 : IS_SUB + 0;
|
|
21
21
|
localparam BW_ADD = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD + 1 : IN1_NEED_BITS + EXTRA_PAD + 1;
|
|
22
22
|
localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_ADD - BW_INPUT0 + SHIFT1 : BW_ADD - BW_INPUT0;
|
|
23
23
|
localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
|
|
@@ -37,6 +37,9 @@ module shift_adder #(
|
|
|
37
37
|
end else begin : in0_is_unsigned
|
|
38
38
|
assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
|
|
39
39
|
end
|
|
40
|
+
endgenerate
|
|
41
|
+
|
|
42
|
+
generate
|
|
40
43
|
if (SIGNED1 == 1) begin : in1_is_signed
|
|
41
44
|
assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
|
|
42
45
|
end else begin : in1_is_unsigned
|
|
@@ -27,3 +27,6 @@ set_clock_uncertainty -setup $uncertainty_setup [get_clocks sys_clk]
|
|
|
27
27
|
set_clock_uncertainty -hold $uncertainty_hold [get_clocks sys_clk]
|
|
28
28
|
|
|
29
29
|
set_property HD.CLK_SRC BUFG_X0Y0 [get_ports clk]
|
|
30
|
+
|
|
31
|
+
set_property retiming_forward 1 [get_cells {stage[*]_inp}]
|
|
32
|
+
set_property retiming_backward 1 [get_cells {stage[*]_inp}]
|
|
@@ -13,7 +13,7 @@ from numpy.typing import NDArray
|
|
|
13
13
|
from ... import codegen
|
|
14
14
|
from ...cmvm.types import CascadedSolution, Solution, _minimal_kif
|
|
15
15
|
from ...trace.pipeline import to_pipeline
|
|
16
|
-
from . import
|
|
16
|
+
from . import binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_logic_gen
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def get_io_kifs(sol: Solution | CascadedSolution):
|
|
@@ -34,6 +34,7 @@ class VerilogModel:
|
|
|
34
34
|
clock_period: int = 5,
|
|
35
35
|
clock_uncertainty: float = 0.1,
|
|
36
36
|
io_delay_minmax: tuple[float, float] = (0.2, 0.4),
|
|
37
|
+
register_layers: int = 1,
|
|
37
38
|
):
|
|
38
39
|
self._solution = solution
|
|
39
40
|
self._path = Path(path)
|
|
@@ -45,6 +46,7 @@ class VerilogModel:
|
|
|
45
46
|
self._clock_period = clock_period
|
|
46
47
|
self._clock_uncertainty = clock_uncertainty
|
|
47
48
|
self._io_delay_minmax = io_delay_minmax
|
|
49
|
+
self._register_layers = register_layers
|
|
48
50
|
|
|
49
51
|
self._pipe = solution if isinstance(solution, CascadedSolution) else None
|
|
50
52
|
if latency_cutoff > 0 and self._pipe is None:
|
|
@@ -57,12 +59,13 @@ class VerilogModel:
|
|
|
57
59
|
self._latency_cutoff = latency_cutoff
|
|
58
60
|
|
|
59
61
|
self._lib = None
|
|
62
|
+
self._uuid = None
|
|
60
63
|
|
|
61
64
|
def write(self):
|
|
62
65
|
self._path.mkdir(parents=True, exist_ok=True)
|
|
63
66
|
if self._pipe is not None: # Pipeline
|
|
64
67
|
# Main logic
|
|
65
|
-
codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency)
|
|
68
|
+
codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency, register_layers=self._register_layers)
|
|
66
69
|
for k, v in codes.items():
|
|
67
70
|
with open(self._path / f'{k}.v', 'w') as f:
|
|
68
71
|
f.write(v)
|
|
@@ -86,8 +89,8 @@ class VerilogModel:
|
|
|
86
89
|
with open(self._path / f'{self._prj_name}.xdc', 'w') as f:
|
|
87
90
|
f.write(xdc)
|
|
88
91
|
|
|
89
|
-
# C++ binder w/
|
|
90
|
-
binder =
|
|
92
|
+
# C++ binder w/ verilog wrapper for uniform bw
|
|
93
|
+
binder = binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)
|
|
91
94
|
|
|
92
95
|
# Verilog IO wrapper (non-uniform bw to uniform one, clk passthrough)
|
|
93
96
|
io_wrapper = generate_io_wrapper(self._pipe, self._prj_name, True)
|
|
@@ -103,7 +106,7 @@ class VerilogModel:
|
|
|
103
106
|
|
|
104
107
|
# Verilog IO wrapper (non-uniform bw to uniform one, no clk)
|
|
105
108
|
io_wrapper = generate_io_wrapper(self._solution, self._prj_name, False)
|
|
106
|
-
binder =
|
|
109
|
+
binder = binder_gen(self._solution, f'{self._prj_name}_wrapper')
|
|
107
110
|
|
|
108
111
|
with open(self._path / f'{self._prj_name}_wrapper.v', 'w') as f:
|
|
109
112
|
f.write(io_wrapper)
|
|
@@ -112,13 +115,16 @@ class VerilogModel:
|
|
|
112
115
|
|
|
113
116
|
# Common resource copy
|
|
114
117
|
shutil.copy(self.__src_root / 'verilog/source/shift_adder.v', self._path)
|
|
118
|
+
shutil.copy(self.__src_root / 'verilog/source/mux.v', self._path)
|
|
119
|
+
shutil.copy(self.__src_root / 'verilog/source/negative.v', self._path)
|
|
115
120
|
shutil.copy(self.__src_root / 'verilog/source/build_binder.mk', self._path)
|
|
116
|
-
shutil.copy(self.__src_root / 'verilog/source/ioutils.hh', self._path)
|
|
121
|
+
shutil.copy(self.__src_root / 'verilog/source/ioutil.hh', self._path)
|
|
122
|
+
shutil.copy(self.__src_root / 'verilog/source/binder_util.hh', self._path)
|
|
117
123
|
self._solution.save(self._path / 'model.json')
|
|
118
124
|
with open(self._path / 'misc.json', 'w') as f:
|
|
119
125
|
f.write(f'{{"cost": {self._solution.cost}}}')
|
|
120
126
|
|
|
121
|
-
def _compile(self, verbose=False, openmp=True, o3: bool = False, clean=True):
|
|
127
|
+
def _compile(self, verbose=False, openmp=True, nproc=None, o3: bool = False, clean=True):
|
|
122
128
|
"""Same as compile, but will not write to the library
|
|
123
129
|
|
|
124
130
|
Parameters
|
|
@@ -127,6 +133,9 @@ class VerilogModel:
|
|
|
127
133
|
Verbose output, by default False
|
|
128
134
|
openmp : bool, optional
|
|
129
135
|
Enable openmp, by default True
|
|
136
|
+
nproc : int | None, optional
|
|
137
|
+
Number of processes to use for compilation, by default None
|
|
138
|
+
If None, will use the number of CPU cores, but not more than 32.
|
|
130
139
|
o3 : bool | None, optional
|
|
131
140
|
Turn on -O3 flag, by default False
|
|
132
141
|
clean : bool, optional
|
|
@@ -144,14 +153,20 @@ class VerilogModel:
|
|
|
144
153
|
env['VM_PREFIX'] = f'{self._prj_name}_wrapper'
|
|
145
154
|
env['STAMP'] = self._uuid
|
|
146
155
|
env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
|
|
156
|
+
if nproc is not None:
|
|
157
|
+
env['N_JOBS'] = str(nproc)
|
|
147
158
|
if o3:
|
|
148
159
|
args.append('fast')
|
|
149
160
|
|
|
150
|
-
if clean:
|
|
161
|
+
if clean is not False:
|
|
151
162
|
m = re.compile(r'^lib.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.so$')
|
|
152
163
|
for p in self._path.iterdir():
|
|
153
164
|
if not p.is_dir() and m.match(p.name):
|
|
154
165
|
p.unlink()
|
|
166
|
+
if clean:
|
|
167
|
+
subprocess.run(
|
|
168
|
+
['make', '-f', 'build_binder.mk', 'clean'], env=env, cwd=self._path, check=True, capture_output=not verbose
|
|
169
|
+
)
|
|
155
170
|
|
|
156
171
|
try:
|
|
157
172
|
r = subprocess.run(args, env=env, check=True, cwd=self._path, capture_output=not verbose)
|
|
@@ -168,13 +183,19 @@ class VerilogModel:
|
|
|
168
183
|
|
|
169
184
|
def _load_lib(self, uuid: str | None = None):
|
|
170
185
|
uuid = uuid if uuid is not None else self._uuid
|
|
186
|
+
if uuid is None:
|
|
187
|
+
# load .so if there is only one, otherwise raise an error
|
|
188
|
+
libs = list(self._path.glob(f'lib{self._prj_name}_wrapper_*.so'))
|
|
189
|
+
if len(libs) == 0:
|
|
190
|
+
raise RuntimeError(f'Cannot load library, found {len(libs)} libraries in {self._path}')
|
|
191
|
+
uuid = libs[0].name.split('_')[-1].split('.', 1)[0]
|
|
171
192
|
self._uuid = uuid
|
|
172
193
|
lib_path = self._path / f'lib{self._prj_name}_wrapper_{uuid}.so'
|
|
173
194
|
if not lib_path.exists():
|
|
174
195
|
raise RuntimeError(f'Library {lib_path} does not exist')
|
|
175
196
|
self._lib = ctypes.CDLL(str(lib_path))
|
|
176
197
|
|
|
177
|
-
def compile(self, verbose=False, openmp=True, o3: bool = False):
|
|
198
|
+
def compile(self, verbose=False, openmp=True, nproc: int | None = None, o3: bool = False, clean=True):
|
|
178
199
|
"""Compile the generated code to a emulator for logic simulation.
|
|
179
200
|
|
|
180
201
|
Parameters
|
|
@@ -183,8 +204,13 @@ class VerilogModel:
|
|
|
183
204
|
Verbose output, by default False
|
|
184
205
|
openmp : bool, optional
|
|
185
206
|
Enable openmp, by default True
|
|
207
|
+
nproc : int | None, optional
|
|
208
|
+
Number of processes to use for compilation, by default None
|
|
209
|
+
If None, will use the number of CPU cores, but not more than 32.
|
|
186
210
|
o3 : bool | None, optional
|
|
187
211
|
Turn on -O3 flag, by default False
|
|
212
|
+
clean : bool, optional
|
|
213
|
+
Remove obsolete shared object files, by default True
|
|
188
214
|
|
|
189
215
|
Raises
|
|
190
216
|
------
|
|
@@ -192,8 +218,7 @@ class VerilogModel:
|
|
|
192
218
|
If compilation fails
|
|
193
219
|
"""
|
|
194
220
|
self.write()
|
|
195
|
-
self._compile(verbose=verbose, openmp=openmp, o3=o3)
|
|
196
|
-
self._load_lib()
|
|
221
|
+
self._compile(verbose=verbose, openmp=openmp, nproc=nproc, o3=o3, clean=clean)
|
|
197
222
|
|
|
198
223
|
def predict(self, data: NDArray[np.floating]):
|
|
199
224
|
"""Run the model on the input data.
|
|
@@ -225,7 +250,7 @@ class VerilogModel:
|
|
|
225
250
|
out_data = np.empty(n_sample * out_size, dtype=np.int32)
|
|
226
251
|
|
|
227
252
|
# Convert to int32 matching the LSB position
|
|
228
|
-
inp_data[:] = data.ravel() * 2.0
|
|
253
|
+
inp_data[:] = np.floor(data.ravel() * 2.0**f_in)
|
|
229
254
|
|
|
230
255
|
inp_buf = inp_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
|
|
231
256
|
out_buf = out_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
|
|
@@ -233,7 +258,7 @@ class VerilogModel:
|
|
|
233
258
|
|
|
234
259
|
# Unscale the output int32 to recover fp values
|
|
235
260
|
k, i, f = np.max(k_out), np.max(i_out), np.max(f_out)
|
|
236
|
-
a, b, c = 2.0 ** (k + i + f), 2.0 ** (i + f), 2.0**-f
|
|
261
|
+
a, b, c = 2.0 ** (k + i + f), k * 2.0 ** (i + f), 2.0**-f
|
|
237
262
|
return ((out_data.reshape(n_sample, out_size) + b) % a - b) * c
|
|
238
263
|
|
|
239
264
|
def __repr__(self):
|
|
@@ -243,11 +268,12 @@ class VerilogModel:
|
|
|
243
268
|
in_bits, out_bits = np.sum(kifs_in), np.sum(kifs_out)
|
|
244
269
|
if self._pipe is not None:
|
|
245
270
|
n_stage = len(self._pipe[0])
|
|
271
|
+
delay_suffix = '' if self._register_layers == 1 else f'x {self._register_layers} '
|
|
246
272
|
lat_cutoff = self._latency_cutoff
|
|
247
273
|
reg_bits = self._pipe.reg_bits
|
|
248
274
|
spec = f"""Top Module: {self._prj_name}\n====================
|
|
249
275
|
{inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
|
|
250
|
-
{n_stage} stages @ max_delay={lat_cutoff}
|
|
276
|
+
{n_stage} {delay_suffix}stages @ max_delay={lat_cutoff}
|
|
251
277
|
Estimated cost: {cost} LUTs, {reg_bits} FFs"""
|
|
252
278
|
|
|
253
279
|
else:
|
|
@@ -258,7 +284,8 @@ Estimated cost: {cost} LUTs"""
|
|
|
258
284
|
|
|
259
285
|
is_compiled = self._lib is not None
|
|
260
286
|
if is_compiled:
|
|
261
|
-
|
|
287
|
+
assert self._uuid is not None
|
|
288
|
+
openmp = 'with OpenMP' if self._lib.openmp_enabled() else '' # type: ignore
|
|
262
289
|
spec += f'\nEmulator is compiled {openmp} ({self._uuid[-12:]})'
|
|
263
290
|
else:
|
|
264
291
|
spec += '\nEmulator is **not compiled**'
|
|
File without changes
|