da4ml 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of da4ml might be problematic. Click here for more details.
- da4ml/__init__.py +16 -16
- da4ml/_version.py +2 -2
- da4ml/cmvm/__init__.py +3 -34
- da4ml/cmvm/api.py +239 -73
- da4ml/cmvm/core/__init__.py +222 -0
- da4ml/cmvm/core/indexers.py +83 -0
- da4ml/cmvm/core/state_opr.py +284 -0
- da4ml/cmvm/types.py +569 -0
- da4ml/cmvm/util/__init__.py +7 -0
- da4ml/cmvm/util/bit_decompose.py +86 -0
- da4ml/cmvm/util/mat_decompose.py +121 -0
- da4ml/codegen/__init__.py +11 -0
- da4ml/codegen/cpp/__init__.py +3 -0
- da4ml/codegen/cpp/cpp_codegen.py +148 -0
- da4ml/codegen/cpp/source/vitis.h +30 -0
- da4ml/codegen/cpp/source/vitis_bridge.h +17 -0
- da4ml/codegen/verilog/__init__.py +13 -0
- da4ml/codegen/verilog/comb.py +146 -0
- da4ml/codegen/verilog/io_wrapper.py +255 -0
- da4ml/codegen/verilog/pipeline.py +49 -0
- da4ml/codegen/verilog/source/build_binder.mk +27 -0
- da4ml/codegen/verilog/source/build_prj.tcl +75 -0
- da4ml/codegen/verilog/source/ioutils.hh +117 -0
- da4ml/codegen/verilog/source/shift_adder.v +56 -0
- da4ml/codegen/verilog/source/template.xdc +29 -0
- da4ml/codegen/verilog/verilog_model.py +265 -0
- da4ml/trace/__init__.py +6 -0
- da4ml/trace/fixed_variable.py +358 -0
- da4ml/trace/fixed_variable_array.py +177 -0
- da4ml/trace/ops/__init__.py +55 -0
- da4ml/trace/ops/conv_utils.py +104 -0
- da4ml/trace/ops/einsum_utils.py +299 -0
- da4ml/trace/pipeline.py +155 -0
- da4ml/trace/tracer.py +120 -0
- da4ml-0.2.0.dist-info/METADATA +65 -0
- da4ml-0.2.0.dist-info/RECORD +39 -0
- {da4ml-0.1.1.dist-info → da4ml-0.2.0.dist-info}/WHEEL +1 -1
- da4ml/cmvm/balanced_reduction.py +0 -46
- da4ml/cmvm/cmvm.py +0 -328
- da4ml/cmvm/codegen.py +0 -159
- da4ml/cmvm/csd.py +0 -73
- da4ml/cmvm/fixed_variable.py +0 -205
- da4ml/cmvm/graph_compile.py +0 -85
- da4ml/cmvm/nb_fixed_precision.py +0 -98
- da4ml/cmvm/scoring.py +0 -55
- da4ml/cmvm/utils.py +0 -5
- da4ml-0.1.1.dist-info/METADATA +0 -121
- da4ml-0.1.1.dist-info/RECORD +0 -18
- {da4ml-0.1.1.dist-info → da4ml-0.2.0.dist-info/licenses}/LICENSE +0 -0
- {da4ml-0.1.1.dist-info → da4ml-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
from itertools import accumulate
|
|
2
|
+
|
|
3
|
+
from ...cmvm.types import CascadedSolution, QInterval, Solution, _minimal_kif
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def hetero_io_map(qints: list[QInterval], merge: bool = False):
    """Compute bit-range mappings between a uniform-stride bus and a tightly packed bus.

    Every element of ``qints`` is converted with ``_minimal_kif`` into a
    ``(k, i, f)`` triple (presumably sign / integer / fractional bit counts --
    confirm against ``cmvm.types``). On the "regular" bus each element owns a
    slot of ``max(k + i) + max(f)`` bits; on the "packed" bus each element
    takes only its own ``k + i + f`` bits, laid out back to back.

    Args:
        qints: Quantization intervals, one per element.
        merge: When true, fuse neighbouring ranges that are contiguous on the
            regular bus (and neighbouring pads with the same fill source).

    Returns:
        A 4-tuple ``(regular, hetero, pads, (width_regular, width_packed))``:

        * ``regular``: ``(high, low)`` bit ranges on the regular bus.
        * ``hetero``: the matching ``(high, low)`` ranges on the packed bus.
        * ``pads``: ``(high, low, copy_from)`` ranges of the regular bus not
          covered by data; ``copy_from`` is ``-1`` or, for elements with a
          truthy ``k``, the packed-bus index of the element's top bit.
        * total widths of the regular and the packed bus.
    """
    n_elem = len(qints)
    keeps, ints, fracs = zip(*map(_minimal_kif, qints))
    int_bits = [i + k for i, k in zip(ints, keeps)]
    widest_int = max(int_bits)
    widest_frac = max(fracs)
    slot = widest_int + widest_frac
    widths = (slot * n_elem, sum(int_bits) + sum(fracs))

    # Packed layout: running sum of the individual element widths.
    offsets = list(accumulate([0] + [I + f for I, f in zip(int_bits, fracs)]))
    hetero = [(upper - 1, lower) for upper, lower in zip(offsets[1:], offsets[:-1])]

    regular: list[tuple[int, int]] = []
    pads: list[tuple[int, int, int]] = []
    for idx in range(n_elem):
        slot_lo = slot * idx
        slot_hi = slot_lo + slot - 1
        lsb_gap = widest_frac - fracs[idx]
        msb_gap = widest_int - int_bits[idx]
        regular.append((slot_hi - msb_gap, slot_lo + lsb_gap))

        if lsb_gap:
            # Unused low bits of the slot.
            pads.append((slot_lo + lsb_gap - 1, slot_lo, -1))
        if msb_gap:
            # Unused high bits of the slot; remember which packed bit to replicate.
            fill_src = hetero[idx][0] if keeps[idx] else -1
            pads.append((slot_hi, slot_hi - msb_gap + 1, fill_src))

    if merge:
        # Walk from the top so that popping index idx + 1 never disturbs
        # entries that are still to be visited.
        for idx in reversed(range(n_elem - 1)):
            if regular[idx + 1][1] - regular[idx][0] == 1:
                upper_reg = regular.pop(idx + 1)
                regular[idx] = (upper_reg[0], regular[idx][1])
                upper_het = hetero.pop(idx + 1)
                hetero[idx] = (upper_het[0], hetero[idx][1])

        for idx in reversed(range(len(pads) - 1)):
            lower, upper = pads[idx], pads[idx + 1]
            if upper[1] - lower[0] == 1 and lower[2] == upper[2]:
                pads[idx] = (upper[0], lower[1], lower[2])
                del pads[idx + 1]

    return regular, hetero, pads, widths
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipelined: bool = False):
    """Generate a Verilog wrapper exposing uniform-stride ``inp``/``out`` ports.

    The wrapper instantiates ``module_name`` (whose ports are tightly packed),
    slices the uniform-stride input bus into the packed input bus, and spreads
    the packed output bus back onto the uniform-stride output bus. Output bits
    not covered by data are driven either with zeros or with a replicated
    packed-output bit, as described by the pads from ``hetero_io_map``.

    Args:
        sol: Solution providing ``inp_qint`` and ``out_qint``.
        module_name: Name of the wrapped module; the wrapper is named
            ``{module_name}_wrapper``.
        pipelined: When true, a ``clk`` port is added and forwarded.

    Returns:
        The Verilog source of the wrapper module as a string.
    """
    reg_in, het_in, _, shape_in = hetero_io_map(sol.inp_qint, merge=True)
    reg_out, het_out, pad_out, shape_out = hetero_io_map(sol.out_qint, merge=True)

    w_reg_in, w_het_in = shape_in
    w_reg_out, w_het_out = shape_out

    inp_assignment = [f'assign packed_inp[{ih}:{jh}] = inp[{ir}:{jr}];' for (ih, jh), (ir, jr) in zip(het_in, reg_in)]

    # Each entry is (high bit on the `out` bus, statement). Keying data and pad
    # assignments in the same index space makes the emitted statements come
    # out in ascending output-bit order.
    _out_assignment: list[tuple[int, str]] = [
        (ir, f'assign out[{ir}:{jr}] = packed_out[{ih}:{jh}];') for (ih, jh), (ir, jr) in zip(het_out, reg_out)
    ]

    for hi, lo, copy_from in pad_out:
        n_bit = hi - lo + 1
        # -1 means constant zero fill; otherwise replicate the given packed bit.
        pad = f"{n_bit}'b0" if copy_from == -1 else f'{{{n_bit}{{packed_out[{copy_from}]}}}}'
        _out_assignment.append((hi, f'assign out[{hi}:{lo}] = {pad};'))
    _out_assignment.sort(key=lambda x: x[0])
    out_assignment = [v for _, v in _out_assignment]

    inp_assignment_str = '\n '.join(inp_assignment)
    out_assignment_str = '\n '.join(out_assignment)

    clk_and_rst_inp, clk_and_rst_bind = '', ''
    if pipelined:
        clk_and_rst_inp = '\n input clk,'
        clk_and_rst_bind = '\n .clk(clk),'

    return f"""`timescale 1 ns / 1 ps

module {module_name}_wrapper ({clk_and_rst_inp}
// verilator lint_off UNUSEDSIGNAL
input [{w_reg_in-1}:0] inp,
// verilator lint_on UNUSEDSIGNAL
output [{w_reg_out-1}:0] out
);
wire [{w_het_in-1}:0] packed_inp;
wire [{w_het_out-1}:0] packed_out;

{inp_assignment_str}

{module_name} op ({clk_and_rst_bind}
.inp(packed_inp),
.out(packed_out)
);

{out_assignment_str}

endmodule
"""
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def comb_binder_gen(sol: Solution, module_name: str):
    """Generate the C++ binder source for a combinational Verilated model.

    The emitted code exposes ``openmp_enabled`` and ``inference`` with C
    linkage. ``inference`` feeds ``n_samples`` input vectors through the
    Verilated module ``V{module_name}`` and, when compiled with OpenMP,
    splits the samples into chunks handled by separate model instances.

    All OpenMP-dependent statements are emitted inside ``#ifdef _OPENMP`` so
    the generated file also compiles when ``<omp.h>`` is not included.

    Args:
        sol: Solution providing ``inp_qint``, ``out_qint`` and ``shape``.
        module_name: Name of the Verilog module the binder drives.

    Returns:
        The C++ source as a string.
    """
    k_in, i_in, f_in = zip(*map(_minimal_kif, sol.inp_qint))
    k_out, i_out, f_out = zip(*map(_minimal_kif, sol.out_qint))
    # Per-element slot width on the uniform-stride bus.
    max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
    max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)

    n_in, n_out = sol.shape
    return f"""#include "V{module_name}.h"
#include "ioutils.hh"
#include <verilated.h>

#ifdef _OPENMP
#include <omp.h>
constexpr bool _openmp = true;
#else
constexpr bool _openmp = false;
#endif

constexpr size_t N_inp = {n_in};
constexpr size_t N_out = {n_out};
constexpr size_t max_inp_bw = {max_inp_bw};
constexpr size_t max_out_bw = {max_out_bw};
typedef V{module_name} dut_t;

extern "C" {{

bool openmp_enabled() {{
return _openmp;
}}

void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
dut_t *dut = new dut_t;

for (size_t i = 0; i < n_samples; ++i) {{
write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[i * N_inp]);
dut->eval();
read_output<N_out, max_out_bw>(dut->out, &c_out[i * N_out]);
}}

dut->final();
delete dut;
}}

void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
#ifdef _OPENMP
size_t n_max_threads = omp_get_max_threads();
size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
size_t n_thread = n_samples / n_samples_per_thread;
n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;

#pragma omp parallel for num_threads(n_thread) schedule(static)
for (size_t i = 0; i < n_thread; ++i) {{
size_t start = i * n_samples_per_thread;
size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
size_t n_samples_this_thread = end - start;

_inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
}}
#else
_inference(c_inp, c_out, n_samples);
#endif
}}
}}"""
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def pipeline_binder_gen(csol: CascadedSolution, module_name: str, II: int = 1):
    """Generate the C++ binder source for a clocked (pipelined) Verilated model.

    The emitted ``_inference`` toggles ``clk`` once per loop iteration,
    writes a new input every ``II`` cycles and reads an output ``latency + 1``
    cycles after the corresponding input, where ``latency`` is the number of
    stages in ``csol.solutions``.

    Args:
        csol: Cascaded solution providing ``inp_qint``, ``out_qint``,
            ``solutions`` and ``shape``.
        module_name: Name of the Verilog module the binder drives.
        II: Initiation interval written into the generated code.

    Returns:
        The C++ source as a string.
    """

    def _slot_width(qints):
        # Widest integer part (including k) plus widest fractional part.
        keeps, ints, fracs = zip(*map(_minimal_kif, qints))
        return max(map(sum, zip(keeps, ints))) + max(fracs)

    max_inp_bw = _slot_width(csol.inp_qint)
    max_out_bw = _slot_width(csol.out_qint)
    n_stage = len(csol.solutions)
    n_in, n_out = csol.shape

    return f"""#include "V{module_name}.h"
#include "ioutils.hh"
#include <verilated.h>

#ifdef _OPENMP
#include <omp.h>
constexpr bool _openmp = true;
#else
constexpr bool _openmp = false;
#endif

constexpr size_t N_inp = {n_in};
constexpr size_t N_out = {n_out};
constexpr size_t max_inp_bw = {max_inp_bw};
constexpr size_t max_out_bw = {max_out_bw};
constexpr size_t II = {II};
constexpr size_t latency = {n_stage};
typedef V{module_name} dut_t;

extern "C" {{

bool openmp_enabled() {{
return _openmp;
}}

void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
dut_t *dut = new dut_t;

size_t clk_req = n_samples * II + latency + 1;

for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {{
size_t t_out = t_inp - latency - 1;

if (t_inp < n_samples * II && t_inp % II == 0) {{
write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[t_inp / II * N_inp]);
}}

dut->clk = 0;
dut->eval();

if (t_inp > latency && t_out % II == 0) {{
read_output<N_out, max_out_bw>(dut->out, &c_out[t_out / II * N_out]);
}}

dut->clk = 1;
dut->eval();
}}

dut->final();
delete dut;
}}

void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
#ifdef _OPENMP
size_t n_max_threads = omp_get_max_threads();
size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
size_t n_thread = n_samples / n_samples_per_thread;
n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;

#pragma omp parallel for num_threads(n_thread) schedule(static)
for (size_t i = 0; i < n_thread; ++i) {{
size_t start = i * n_samples_per_thread;
size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
size_t n_samples_this_thread = end - start;

_inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
}}
#else
_inference(c_inp, c_out, n_samples);
#endif
}}

}}"""
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from ...cmvm.types import CascadedSolution, _minimal_kif
|
|
2
|
+
from .comb import comb_logic_gen
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def pipeline_logic_gen(
    csol: CascadedSolution, name: str, print_latency=False, timescale: str | None = '`timescale 1 ns / 1 ps', reset_high=True
):
    """Generate Verilog for a register-separated chain of combinational stages.

    One module ``{name}_stage{i}`` is produced per entry of
    ``csol.solutions`` via ``comb_logic_gen``; a top module ``name`` registers
    the input, every inter-stage bus and the output on ``posedge clk``.

    Args:
        csol: Cascaded solution providing ``solutions`` and ``out_qint``.
        name: Name of the top module and prefix of the stage modules.
        print_latency: Forwarded to ``comb_logic_gen``.
        timescale: Optional directive prepended to the top module and
            forwarded to ``comb_logic_gen``; skipped when falsy.
        reset_high: Accepted for interface compatibility; not used in this
            function body.

    Returns:
        Mapping from module name to Verilog source: all stage modules first,
        then the top module under ``name``.
    """
    stages = csol.solutions
    n_stage = len(stages)

    def _bus_width(qints):
        # Sum of all (k, i, f) components over every element of the bus.
        return sum(map(sum, map(_minimal_kif, qints)))

    inp_bits = [_bus_width(stage.inp_qint) for stage in stages]
    # A stage's output bus is the next stage's input bus; the last one is the model output.
    out_bits = [*inp_bits[1:], _bus_width(csol.out_qint)]

    reg_decls = [f'reg [{width}-1:0] stage{idx}_inp;' for idx, width in enumerate(inp_bits)]
    wire_decls = [f'wire [{width}-1:0] stage{idx}_out;' for idx, width in enumerate(out_bits)]
    instances = [f'{name}_stage{idx} stage{idx} (.inp(stage{idx}_inp), .out(stage{idx}_out));' for idx in range(n_stage)]

    transfers = [
        'stage0_inp <= inp;',
        *(f'stage{idx}_inp <= stage{idx-1}_out;' for idx in range(1, n_stage)),
        f'out <= stage{n_stage-1}_out;',
    ]

    sep_decl = '\n '
    sep_stmt = '\n '

    module = f"""module {name} (
input clk,
input [{inp_bits[0]-1}:0] inp,
output reg [{out_bits[-1]-1}:0] out
);

{sep_decl.join(reg_decls)}
{sep_decl.join(wire_decls)}

{sep_decl.join(instances)}

always @(posedge clk) begin
{sep_stmt.join(transfers)}
end
endmodule
"""

    if timescale:
        module = f'{timescale}\n\n{module}'

    stage_names = [f'{name}_stage{idx}' for idx in range(n_stage)]
    ret: dict[str, str] = {
        stage_name: comb_logic_gen(stage, stage_name, print_latency=print_latency, timescale=timescale)
        for stage_name, stage in zip(stage_names, stages)
    }
    ret[name] = module
    return ret
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Builds the Verilated model of $(VM_PREFIX).v and links it together with
# $(VM_PREFIX)_binder.cc into the shared library $(LIBNAME).
# Expected from the caller: VM_PREFIX and STAMP (used in the library name).
default: slow

# These targets name actions, not files; declaring them phony keeps them
# working even if a file called "clean", "fast", ... exists in the directory.
.PHONY: default fast slow clean

VERILATOR_ROOT = $(shell verilator -V | grep -a VERILATOR_ROOT | tail -1 | awk '{{print $$3}}')
INCLUDES = -I./obj_dir -I$(VERILATOR_ROOT)/include
WARNINGS = -Wl,--no-undefined
CFLAGS = -std=c++17 -fPIC
LINKFLAGS = $(INCLUDES) $(WARNINGS)
LIBNAME = lib$(VM_PREFIX)_$(STAMP).so
N_JOBS ?= $(shell nproc)


# Verilate and compile the model into static archives under ./obj_dir.
./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a: $(VM_PREFIX).v
	verilator --cc -j $(N_JOBS) -Wall -build $(VM_PREFIX).v --prefix V$(VM_PREFIX) -CFLAGS "$(CFLAGS)"

# Link the binder with the Verilated archives into the shared library.
$(LIBNAME): ./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a $(VM_PREFIX)_binder.cc
	$(CXX) $(CFLAGS) $(LINKFLAGS) $(CXXFLAGS2) -pthread -shared -o $(LIBNAME) $(VM_PREFIX)_binder.cc ./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a $(EXTRA_CXXFLAGS)


# Target-specific optimisation levels.
fast: CFLAGS += -O3
fast: $(LIBNAME)

slow: CFLAGS += -O
slow: $(LIBNAME)

clean:
	rm -rf obj_dir
	rm -f $(LIBNAME)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Vivado batch flow: synthesize, optimize, place and route the wrapper module
# out-of-context, writing checkpoints and reports under ./output_<project_name>.
# NOTE(review): ${PROJECT_NAME} and ${DEVICE} look like template placeholders
# filled in before the script is run -- confirm against the generator.
set project_name "${PROJECT_NAME}"
set device "${DEVICE}"

set top_module "${project_name}_wrapper"
set output_dir "./output_${project_name}"

create_project $project_name "${output_dir}/$project_name" -force -part $device

set_property TARGET_LANGUAGE Verilog [current_project]
set_property DEFAULT_LIB work [current_project]

# Sources: the I/O wrapper, the model itself, the shared shift_adder primitive
# and any per-stage modules (only present for pipelined designs).
read_verilog "${project_name}_wrapper.v"
read_verilog "${project_name}.v"
read_verilog "shift_adder.v"
foreach file [glob -nocomplain "${project_name}_stage*.v"] {
    read_verilog $file
}

read_xdc "${project_name}.xdc" -mode out_of_context

set_property top $top_module [current_fileset]

file mkdir $output_dir
file mkdir "${output_dir}/reports"

# Synthesis
synth_design -top $top_module -mode out_of_context -retiming \
    -flatten_hierarchy rebuilt -resource_sharing auto \
    -keep_equivalent_registers -shreg_min_size 8 \
    -directive AlternateRoutability

write_checkpoint -force "${output_dir}/${project_name}_post_synth.dcp"

report_timing_summary -file "${output_dir}/reports/${project_name}_post_synth_timing.rpt"
report_power -file "${output_dir}/reports/${project_name}_post_synth_power.rpt"
report_utilization -file "${output_dir}/reports/${project_name}_post_synth_util.rpt"

# set_property CARRY_REMAP 3 [get_cells -hier -filter {ref_name == CARRY8}]

# Logic optimization, two passes with different directives
opt_design -directive ExploreSequentialArea
opt_design -directive ExploreWithRemap

report_design_analysis -congestion -file "${output_dir}/reports/${project_name}_post_opt_congestion.rpt"

# Placement
place_design -directive AltSpreadLogic_high -fanout_opt
report_design_analysis -congestion -file "${output_dir}/reports/${project_name}_post_place_congestion_initial.rpt"

# Physical optimization on the placed design
phys_opt_design -directive AggressiveExplore
write_checkpoint -force "${output_dir}/${project_name}_post_place.dcp"

report_design_analysis -congestion -file "${output_dir}/reports/${project_name}_post_place_congestion_final.rpt"

report_timing_summary -file "${output_dir}/reports/${project_name}_post_place_timing.rpt"
report_utilization -hierarchical -file "${output_dir}/reports/${project_name}_post_place_util.rpt"

# Routing
route_design -directive NoTimingRelaxation
write_checkpoint -force "${output_dir}/${project_name}_post_route.dcp"


# Post-route reports (text)
report_timing_summary -file "${output_dir}/reports/${project_name}_post_route_timing.rpt"
report_timing -sort_by group -max_paths 100 -path_type summary -file "${output_dir}/reports/${project_name}_post_route_timing_paths.rpt"
report_clock_utilization -file "${output_dir}/reports/${project_name}_post_route_clock_util.rpt"
report_utilization -file "${output_dir}/reports/${project_name}_post_route_util.rpt"
report_power -file "${output_dir}/reports/${project_name}_post_route_power.rpt"
report_drc -file "${output_dir}/reports/${project_name}_post_route_drc.rpt"

# Post-route reports (XML)
report_utilization -format xml -hierarchical -file "${output_dir}/reports/${project_name}_post_route_util.xml"
report_power -xpe "${output_dir}/reports/${project_name}_post_route_power.xml"

# Generate Verilog netlist for simulation
# write_verilog -force "${output_dir}/${project_name}_impl_netlist.v" -mode timesim -sdf_anno true

puts "Implementation complete. Results saved in ${output_dir}"
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
#include "verilated.h"
|
|
2
|
+
#include <cassert>
|
|
3
|
+
#include <cstdint>
|
|
4
|
+
#include <vector>
|
|
5
|
+
// Pack N_in values into 32-bit words. Only the low `bw` bits of each value
// are kept; fields are laid out back to back starting at bit 0 of word 0,
// and a field that does not fit in the current word continues in the next.
template <size_t bw, size_t N_in> std::vector<int32_t> bitpack(const int32_t *values) {
    static_assert(bw > 0 && bw <= 32, "Bit width must be between 1 and 32");

    constexpr size_t n_words = (N_in * bw + 31) / 32;
    constexpr uint32_t field_mask = (bw == 32) ? 0xFFFFFFFF : ((1U << bw) - 1);

    std::vector<int32_t> words(n_words, 0);

    for (size_t idx = 0, cursor = 0; idx < N_in; ++idx, cursor += bw) {
        const uint32_t field = values[idx] & field_mask;
        const size_t word = cursor / 32;
        const size_t shift = cursor % 32;

        // Part of the field that lands in the current word.
        words[word] |= (field << shift);

        // Remainder that spills into the following word, if any.
        const bool spills = shift + bw > 32;
        if (spills && word + 1 < words.size()) {
            words[word + 1] |= (field >> (32 - shift));
        }
    }

    return words;
}
|
|
35
|
+
|
|
36
|
+
// Inverse of bitpack: extract N_out consecutive `bw`-bit fields from a vector
// of 32-bit words. Fields are returned as raw bit patterns (no sign
// extension is applied here).
template <size_t bw, size_t N_out> std::vector<int32_t> bitunpack(const std::vector<int32_t> &packed) {
    static_assert(bw > 0 && bw <= 32, "Bit width must be between 1 and 32");

    constexpr size_t n_words = (N_out * bw + 31) / 32;
    assert(packed.size() == n_words);

    std::vector<int32_t> fields(N_out, 0);

    for (size_t idx = 0; idx < N_out; ++idx) {
        const size_t first_bit = idx * bw;
        const size_t word = first_bit / 32;
        const size_t shift = first_bit % 32;

        // Bits of the field that live in the current word.
        const size_t low_bits = std::min(bw, 32 - shift);
        const uint32_t low_mask = low_bits == 32 ? 0xFFFFFFFF : ((1U << low_bits) - 1);
        int32_t field = (packed[word] >> shift) & low_mask;

        // Remaining bits, taken from the bottom of the next word.
        if (shift + bw > 32) {
            assert(word + 1 < packed.size());
            const size_t high_bits = shift + bw - 32;
            const uint32_t high_mask = ((1U << high_bits) - 1);
            const uint32_t carry = packed[word + 1] & high_mask;
            field |= (carry << low_bits);
        }

        fields[idx] = field;
    }

    return fields;
}
|
|
69
|
+
|
|
70
|
+
// Store up to 64 packed bits into an integral port buffer.
// input[0] supplies bits 0..31; when bits_in > 32, input[1] supplies bits
// 32..63. The upper word is widened through unsigned types so that shifting
// a word with its top bit set is well defined, and the branch is only
// instantiated for buffers that actually need it.
template <size_t bits_in, typename inp_buf_t>
std::enable_if_t<std::is_integral_v<inp_buf_t>, void> _write_input(inp_buf_t &inp_buf, const std::vector<int32_t> &input) {
    assert(input.size() == (bits_in + 31) / 32);
    inp_buf = input[0] & 0xFFFFFFFF;
    if constexpr (bits_in > 32) {
        inp_buf |= static_cast<uint64_t>(static_cast<uint32_t>(input[1])) << 32;
    }
}
|
|
78
|
+
|
|
79
|
+
// Store packed 32-bit words into a wide (VlWide) port buffer, one word per
// buffer element, in the same order as they appear in `input`.
template <size_t bits_in, size_t N_in> void _write_input(VlWide<N_in> &inp_buf, const std::vector<int32_t> &input) {
    assert(input.size() == (bits_in + 31) / 32);
    size_t word = 0;
    for (const int32_t chunk : input) {
        inp_buf[word++] = chunk;
    }
}
|
|
85
|
+
|
|
86
|
+
// Read up to 64 bits from an integral port buffer into 32-bit words.
// output[0] receives bits 0..31; when bits_out > 32, output[1] receives bits
// 32..63. The upper-word extraction is only instantiated when it is needed
// and is done on a 64-bit unsigned value, so buffers narrower than 64 bits
// never see a shift by their full width.
template <size_t bits_out, typename out_buf_t>
std::enable_if_t<std::is_integral_v<out_buf_t>, std::vector<int32_t>> _read_output(out_buf_t &out_buf) {
    std::vector<int32_t> output((bits_out + 31) / 32);
    output[0] = out_buf & 0xFFFFFFFF;
    if constexpr (bits_out > 32) {
        output[1] = (static_cast<uint64_t>(out_buf) >> 32) & 0xFFFFFFFF;
    }
    return output;
}
|
|
95
|
+
|
|
96
|
+
// Read a wide (VlWide) port buffer into a vector of 32-bit words, one buffer
// element per word, covering ceil(bits_out / 32) words.
template <size_t bits_out, size_t N_out> std::vector<int32_t> _read_output(VlWide<N_out> out_buf) {
    constexpr size_t n_words = (bits_out + 31) / 32;
    std::vector<int32_t> output(n_words);
    for (size_t word = 0; word != n_words; ++word) {
        output[word] = out_buf[word] & 0xFFFFFFFF;
    }
    return output;
}
|
|
103
|
+
|
|
104
|
+
template <size_t N, size_t max_bw, typename inp_buf_t> void write_input(inp_buf_t &inp_buf, const int32_t *c_inp) {
|
|
105
|
+
constexpr size_t bits_in = N * max_bw;
|
|
106
|
+
std::vector<int32_t> input = bitpack<max_bw, N>(c_inp);
|
|
107
|
+
_write_input<bits_in>(inp_buf, input);
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
template <size_t N, size_t max_bw, typename out_buf_t> void read_output(out_buf_t out_buf, int32_t *c_out) {
|
|
111
|
+
constexpr size_t bits_out = N * max_bw;
|
|
112
|
+
std::vector<int32_t> packed = _read_output<bits_out>(out_buf);
|
|
113
|
+
std::vector<int32_t> unpacked = bitunpack<max_bw, N>(packed);
|
|
114
|
+
for (size_t i = 0; i < N; ++i) {
|
|
115
|
+
c_out[i] = unpacked[i];
|
|
116
|
+
}
|
|
117
|
+
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
`timescale 1ns / 1ps
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
// Parameterised shift-and-add / shift-and-subtract primitive.
//
// Computes in0 +/- in1 after aligning the operands by SHIFT1:
//   SHIFT1 > 0 : in1 is shifted left by SHIFT1 bits (zero-filled on the right)
//   SHIFT1 < 0 : in0 is shifted left by -SHIFT1 bits instead
// Each operand is sign- or zero-extended to the internal width BW_ADD
// according to SIGNED0 / SIGNED1, and the low BW_OUT bits of the result are
// driven onto `out`.
module shift_adder #(
    parameter BW_INPUT0 = 32,  // width of in0
    parameter BW_INPUT1 = 32,  // width of in1
    parameter SIGNED0 = 0,     // 1: in0 is sign-extended, otherwise zero-extended
    parameter SIGNED1 = 0,     // 1: in1 is sign-extended, otherwise zero-extended
    parameter BW_OUT = 32,     // width of out
    parameter SHIFT1 = 0,      // relative left shift of in1 with respect to in0 (may be negative)
    parameter IS_SUB = 0       // 1: out = in0 - in1, otherwise out = in0 + in1
) (
    input [BW_INPUT0-1:0] in0,
    input [BW_INPUT1-1:0] in1,
    output [BW_OUT-1:0] out
);

    // Bits each operand occupies once the shift has been applied.
    localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
    localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
    // Extra headroom: one bit for subtraction, one more for mixed signedness.
    localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB+1 : IS_SUB+0;
    // Internal adder width: the wider operand plus headroom plus one carry bit.
    localparam BW_ADD = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD + 1 : IN1_NEED_BITS + EXTRA_PAD + 1;
    // Left (extension) and right (shift) padding needed to bring each operand to BW_ADD bits.
    localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_ADD - BW_INPUT0 + SHIFT1 : BW_ADD - BW_INPUT0;
    localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
    localparam IN1_PAD_LEFT = (SHIFT1 > 0) ? BW_ADD - BW_INPUT1 - SHIFT1 : BW_ADD - BW_INPUT1;
    localparam IN1_PAD_RIGHT = (SHIFT1 > 0) ? SHIFT1 : 0;

    wire [BW_ADD-1:0] in0_ext;
    wire [BW_ADD-1:0] in1_ext;

    // Upper bits of accum are dropped whenever BW_OUT < BW_ADD.
    // verilator lint_off UNUSEDSIGNAL
    wire [BW_ADD-1:0] accum;
    // verilator lint_on UNUSEDSIGNAL

    // Extend and shift the operands.
    generate
        if (SIGNED0 == 1) begin : in0_is_signed
            assign in0_ext = {{IN0_PAD_LEFT{in0[BW_INPUT0-1]}}, in0, {IN0_PAD_RIGHT{1'b0}}};
        end else begin : in0_is_unsigned
            assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
        end
        if (SIGNED1 == 1) begin : in1_is_signed
            assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
        end else begin : in1_is_unsigned
            assign in1_ext = {{IN1_PAD_LEFT{1'b0}}, in1, {IN1_PAD_RIGHT{1'b0}}};
        end
    endgenerate

    // Add or subtract at full internal width.
    generate
        if (IS_SUB == 1) begin : is_sub
            assign accum = in0_ext - in1_ext;
        end else begin : is_add
            assign accum = in0_ext + in1_ext;
        end
    endgenerate
    assign out = accum[BW_OUT-1:0];

endmodule
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Timing constraints template for the out-of-context build.
# NOTE(review): the ${...} tokens look like placeholders substituted before
# the file is read by the tool -- confirm against the generator.
set clock_period ${CLOCK_PERIOD}

# Clock uncertainty and I/O delays, each given as a ratio of the clock period
# (they are multiplied by the period below)
set uncertainty_setup_r ${UNCERTAINITY_SETUP}
set uncertainty_hold_r ${UNCERTAINITY_HOLD}
set delay_max_r ${DELAY_MAX}
set delay_min_r ${DELAY_MIN}

# Calculate the absolute uncertainty and delay values
set uncertainty_setup [expr {$clock_period * $uncertainty_setup_r}]
set uncertainty_hold [expr {$clock_period * $uncertainty_hold_r}]
set delay_max [expr {$clock_period * $delay_max_r}]
set delay_min [expr {$clock_period * $delay_min_r}]

# Create clock with variable period
create_clock -period $clock_period -name sys_clk [get_ports {clk}]

# Input/Output constraints
set_input_delay -clock sys_clk -max $delay_max [get_ports {inp[*]}]
set_input_delay -clock sys_clk -min $delay_min [get_ports {inp[*]}]

set_output_delay -clock sys_clk -max $delay_max [get_ports {out[*]}]
set_output_delay -clock sys_clk -min $delay_min [get_ports {out[*]}]

# Apply calculated uncertainty values
set_clock_uncertainty -setup $uncertainty_setup [get_clocks sys_clk]
set_clock_uncertainty -hold $uncertainty_hold [get_clocks sys_clk]

# Declare the clock source for the out-of-context clk port
set_property HD.CLK_SRC BUFG_X0Y0 [get_ports clk]
|