da4ml 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of da4ml might be problematic. Click here for more details.

Files changed (50) hide show
  1. da4ml/__init__.py +16 -16
  2. da4ml/_version.py +2 -2
  3. da4ml/cmvm/__init__.py +3 -34
  4. da4ml/cmvm/api.py +239 -73
  5. da4ml/cmvm/core/__init__.py +222 -0
  6. da4ml/cmvm/core/indexers.py +83 -0
  7. da4ml/cmvm/core/state_opr.py +284 -0
  8. da4ml/cmvm/types.py +569 -0
  9. da4ml/cmvm/util/__init__.py +7 -0
  10. da4ml/cmvm/util/bit_decompose.py +86 -0
  11. da4ml/cmvm/util/mat_decompose.py +121 -0
  12. da4ml/codegen/__init__.py +11 -0
  13. da4ml/codegen/cpp/__init__.py +3 -0
  14. da4ml/codegen/cpp/cpp_codegen.py +148 -0
  15. da4ml/codegen/cpp/source/vitis.h +30 -0
  16. da4ml/codegen/cpp/source/vitis_bridge.h +17 -0
  17. da4ml/codegen/verilog/__init__.py +13 -0
  18. da4ml/codegen/verilog/comb.py +146 -0
  19. da4ml/codegen/verilog/io_wrapper.py +255 -0
  20. da4ml/codegen/verilog/pipeline.py +49 -0
  21. da4ml/codegen/verilog/source/build_binder.mk +27 -0
  22. da4ml/codegen/verilog/source/build_prj.tcl +75 -0
  23. da4ml/codegen/verilog/source/ioutils.hh +117 -0
  24. da4ml/codegen/verilog/source/shift_adder.v +56 -0
  25. da4ml/codegen/verilog/source/template.xdc +29 -0
  26. da4ml/codegen/verilog/verilog_model.py +265 -0
  27. da4ml/trace/__init__.py +6 -0
  28. da4ml/trace/fixed_variable.py +358 -0
  29. da4ml/trace/fixed_variable_array.py +177 -0
  30. da4ml/trace/ops/__init__.py +55 -0
  31. da4ml/trace/ops/conv_utils.py +104 -0
  32. da4ml/trace/ops/einsum_utils.py +299 -0
  33. da4ml/trace/pipeline.py +155 -0
  34. da4ml/trace/tracer.py +120 -0
  35. da4ml-0.2.0.dist-info/METADATA +65 -0
  36. da4ml-0.2.0.dist-info/RECORD +39 -0
  37. {da4ml-0.1.1.dist-info → da4ml-0.2.0.dist-info}/WHEEL +1 -1
  38. da4ml/cmvm/balanced_reduction.py +0 -46
  39. da4ml/cmvm/cmvm.py +0 -328
  40. da4ml/cmvm/codegen.py +0 -159
  41. da4ml/cmvm/csd.py +0 -73
  42. da4ml/cmvm/fixed_variable.py +0 -205
  43. da4ml/cmvm/graph_compile.py +0 -85
  44. da4ml/cmvm/nb_fixed_precision.py +0 -98
  45. da4ml/cmvm/scoring.py +0 -55
  46. da4ml/cmvm/utils.py +0 -5
  47. da4ml-0.1.1.dist-info/METADATA +0 -121
  48. da4ml-0.1.1.dist-info/RECORD +0 -18
  49. {da4ml-0.1.1.dist-info → da4ml-0.2.0.dist-info/licenses}/LICENSE +0 -0
  50. {da4ml-0.1.1.dist-info → da4ml-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,255 @@
1
+ from itertools import accumulate
2
+
3
+ from ...cmvm.types import CascadedSolution, QInterval, Solution, _minimal_kif
4
+
5
+
6
def hetero_io_map(qints: list[QInterval], merge: bool = False):
    """Map bit ranges between the uniform-slot and the packed I/O layouts.

    Each entry of ``qints`` is reduced to ``(k, i, f)`` by ``_minimal_kif``.
    In the *regular* layout every element owns a slot of
    ``max(k + i) + max(f)`` bits; in the *packed* (hetero) layout elements are
    placed back to back and use only their own ``k + i + f`` bits.

    Args:
        qints: Quantization intervals, one per element.
        merge: When true, fuse neighbouring ranges that are contiguous in the
            regular layout, and neighbouring pads that are contiguous and
            share the same fill source.

    Returns:
        ``(regular, hetero, pads, (width_regular, width_packed))``.
        ``regular`` and ``hetero`` hold ``(high, low)`` bit ranges; ``pads``
        holds ``(high, low, copy_from)`` where ``copy_from`` is the packed bit
        index to replicate, or ``-1`` for zero fill.
    """
    n_elem = len(qints)
    keeps, ints, fracs = zip(*map(_minimal_kif, qints))
    int_bits = [i + k for i, k in zip(ints, keeps)]
    top_int = max(int_bits)
    top_frac = max(fracs)
    slot_width = top_int + top_frac
    widths = (slot_width * n_elem, sum(int_bits) + sum(fracs))

    # Packed layout: running offsets over the per-element bit widths.
    offsets = list(accumulate([0] + [ib + fb for ib, fb in zip(int_bits, fracs)]))
    hetero = [(stop - 1, start) for start, stop in zip(offsets, offsets[1:])]

    regular: list[tuple[int, int]] = []
    pads: list[tuple[int, int, int]] = []
    for idx, (keep, ib, fb) in enumerate(zip(keeps, int_bits, fracs)):
        slot_lo = slot_width * idx
        slot_hi = slot_lo + slot_width - 1
        gap_lo = top_frac - fb  # unused low-order bits of this slot
        gap_hi = top_int - ib  # unused high-order bits of this slot
        regular.append((slot_hi - gap_hi, slot_lo + gap_lo))

        if gap_lo != 0:
            pads.append((slot_lo + gap_lo - 1, slot_lo, -1))
        if gap_hi != 0:
            # When k is set the high pad replicates the element's top packed
            # bit; otherwise it is zero filled.
            source = hetero[idx][0] if keep else -1
            pads.append((slot_hi, slot_hi + 1 - gap_hi, source))

    if not merge:
        return regular, hetero, pads, widths

    # Walk from the top so that popping never disturbs indices still to visit.
    for idx in reversed(range(n_elem - 1)):
        if regular[idx + 1][1] - regular[idx][0] != 1:
            continue
        upper_regular = regular.pop(idx + 1)
        upper_hetero = hetero.pop(idx + 1)
        regular[idx] = (upper_regular[0], regular[idx][1])
        hetero[idx] = (upper_hetero[0], hetero[idx][1])

    for idx in reversed(range(len(pads) - 1)):
        lower, upper = pads[idx], pads[idx + 1]
        if upper[1] - lower[0] == 1 and lower[2] == upper[2]:
            pads[idx] = (upper[0], lower[1], lower[2])
            pads.pop(idx + 1)

    return regular, hetero, pads, widths
55
+
56
+
57
def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipelined: bool = False):
    """Return Verilog source for ``{module_name}_wrapper``.

    The wrapper exposes uniform-width ``inp``/``out`` ports, rewires them to
    the packed buses of an instance of ``module_name`` and fills the unused
    output bits according to the pads reported by ``hetero_io_map``.

    Args:
        sol: Object providing ``inp_qint`` and ``out_qint``.
        module_name: Name of the wrapped module.
        pipelined: When true, a ``clk`` port is added and bound to the instance.
    """
    regular_in, packed_in, _, (port_w_in, bus_w_in) = hetero_io_map(sol.inp_qint, merge=True)
    regular_out, packed_out, out_pads, (port_w_out, bus_w_out) = hetero_io_map(sol.out_qint, merge=True)

    inp_lines = [
        f'assign packed_inp[{p_hi}:{p_lo}] = inp[{r_hi}:{r_lo}];'
        for (p_hi, p_lo), (r_hi, r_lo) in zip(packed_in, regular_in)
    ]

    # (sort key, statement) pairs; data assignments are keyed by their packed
    # high bit, pad assignments by their high bit on the out port.
    keyed_out: list[tuple[int, str]] = [
        (p_hi, f'assign out[{r_hi}:{r_lo}] = packed_out[{p_hi}:{p_lo}];')
        for (p_hi, p_lo), (r_hi, r_lo) in zip(packed_out, regular_out)
    ]
    for hi, lo, copy_from in out_pads:
        width = hi - lo + 1
        if copy_from == -1:
            fill = f"{width}'b0"
        else:
            fill = f'{{{width}{{packed_out[{copy_from}]}}}}'
        keyed_out.append((hi, f'assign out[{hi}:{lo}] = {fill};'))
    out_lines = [stmt for _, stmt in sorted(keyed_out, key=lambda pair: pair[0])]

    inp_assignment_str = '\n '.join(inp_lines)
    out_assignment_str = '\n '.join(out_lines)

    clk_and_rst_inp = '\n input clk,' if pipelined else ''
    clk_and_rst_bind = '\n .clk(clk),' if pipelined else ''

    return f"""`timescale 1 ns / 1 ps

module {module_name}_wrapper ({clk_and_rst_inp}
// verilator lint_off UNUSEDSIGNAL
input [{port_w_in-1}:0] inp,
// verilator lint_on UNUSEDSIGNAL
output [{port_w_out-1}:0] out
);
wire [{bus_w_in-1}:0] packed_inp;
wire [{bus_w_out-1}:0] packed_out;

{inp_assignment_str}

{module_name} op ({clk_and_rst_bind}
.inp(packed_inp),
.out(packed_out)
);

{out_assignment_str}

endmodule
"""
107
+
108
+
109
def comb_binder_gen(sol: Solution, module_name: str):
    """Return C++ binder source that drives the combinational model ``V{module_name}``.

    The generated translation unit exports ``openmp_enabled``, ``_inference``
    and ``inference`` with C linkage. ``inference`` splits the samples across
    OpenMP threads when ``_OPENMP`` is defined and otherwise forwards to
    ``_inference``.

    Args:
        sol: Object providing ``inp_qint``, ``out_qint`` and ``shape``
            (unpacked as ``(n_in, n_out)``).
        module_name: Name of the Verilated module; ``V{module_name}.h`` is included.
    """
    k_in, i_in, f_in = zip(*map(_minimal_kif, sol.inp_qint))
    k_out, i_out, f_out = zip(*map(_minimal_kif, sol.out_qint))
    max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
    max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)

    n_in, n_out = sol.shape
    # The thread-count computation sits inside the _OPENMP guard:
    # omp_get_max_threads() is only declared when <omp.h> is included, which
    # happens under the same macro, so leaving it outside breaks non-OpenMP builds.
    return f"""#include "V{module_name}.h"
#include "ioutils.hh"
#include <verilated.h>

#ifdef _OPENMP
#include <omp.h>
constexpr bool _openmp = true;
#else
constexpr bool _openmp = false;
#endif

constexpr size_t N_inp = {n_in};
constexpr size_t N_out = {n_out};
constexpr size_t max_inp_bw = {max_inp_bw};
constexpr size_t max_out_bw = {max_out_bw};
typedef V{module_name} dut_t;

extern "C" {{

bool openmp_enabled() {{
return _openmp;
}}

void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
dut_t *dut = new dut_t;

for (size_t i = 0; i < n_samples; ++i) {{
write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[i * N_inp]);
dut->eval();
read_output<N_out, max_out_bw>(dut->out, &c_out[i * N_out]);
}}

dut->final();
delete dut;
}}

void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
#ifdef _OPENMP
size_t n_max_threads = omp_get_max_threads();
size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
size_t n_thread = n_samples / n_samples_per_thread;
n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;

#pragma omp parallel for num_threads(n_thread) schedule(static)
for (size_t i = 0; i < n_thread; ++i) {{
size_t start = i * n_samples_per_thread;
size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
size_t n_samples_this_thread = end - start;

_inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
}}
#else
_inference(c_inp, c_out, n_samples);
#endif
}}
}}"""
172
+
173
+
174
def pipeline_binder_gen(csol: CascadedSolution, module_name: str, II: int = 1):
    """Return C++ binder source that clocks the pipelined model ``V{module_name}``.

    The generated ``_inference`` feeds one sample every ``II`` cycles and
    reads results back ``latency + 1`` cycles later, where ``latency`` is the
    number of entries in ``csol.solutions``.

    Args:
        csol: Object providing ``inp_qint``, ``out_qint``, ``solutions`` and
            ``shape`` (unpacked as ``(n_in, n_out)``).
        module_name: Name of the Verilated module; ``V{module_name}.h`` is included.
        II: Initiation interval written into the generated source.
    """

    def _slot_width(qints):
        # Widest k + i plus widest f over all elements.
        keeps, ints, fracs = zip(*map(_minimal_kif, qints))
        return max(k + i for k, i in zip(keeps, ints)) + max(fracs)

    inp_slot = _slot_width(csol.inp_qint)
    out_slot = _slot_width(csol.out_qint)
    depth = len(csol.solutions)
    n_in, n_out = csol.shape

    return f"""#include "V{module_name}.h"
#include "ioutils.hh"
#include <verilated.h>

#ifdef _OPENMP
#include <omp.h>
constexpr bool _openmp = true;
#else
constexpr bool _openmp = false;
#endif

constexpr size_t N_inp = {n_in};
constexpr size_t N_out = {n_out};
constexpr size_t max_inp_bw = {inp_slot};
constexpr size_t max_out_bw = {out_slot};
constexpr size_t II = {II};
constexpr size_t latency = {depth};
typedef V{module_name} dut_t;

extern "C" {{

bool openmp_enabled() {{
return _openmp;
}}

void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
dut_t *dut = new dut_t;

size_t clk_req = n_samples * II + latency + 1;

for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {{
size_t t_out = t_inp - latency - 1;

if (t_inp < n_samples * II && t_inp % II == 0) {{
write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[t_inp / II * N_inp]);
}}

dut->clk = 0;
dut->eval();

if (t_inp > latency && t_out % II == 0) {{
read_output<N_out, max_out_bw>(dut->out, &c_out[t_out / II * N_out]);
}}

dut->clk = 1;
dut->eval();
}}

dut->final();
delete dut;
}}

void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
#ifdef _OPENMP
size_t n_max_threads = omp_get_max_threads();
size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
size_t n_thread = n_samples / n_samples_per_thread;
n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;

#pragma omp parallel for num_threads(n_thread) schedule(static)
for (size_t i = 0; i < n_thread; ++i) {{
size_t start = i * n_samples_per_thread;
size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
size_t n_samples_this_thread = end - start;

_inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
}}
#else
_inference(c_inp, c_out, n_samples);
#endif
}}

}}"""
@@ -0,0 +1,49 @@
1
+ from ...cmvm.types import CascadedSolution, _minimal_kif
2
+ from .comb import comb_logic_gen
3
+
4
+
5
def pipeline_logic_gen(
    csol: CascadedSolution, name: str, print_latency=False, timescale: str | None = '`timescale 1 ns / 1 ps', reset_high=True
):
    """Generate Verilog for a registered chain of combinational stages.

    Args:
        csol: Object providing ``solutions`` (one per stage) and ``out_qint``.
        name: Name of the top module; stage modules are ``{name}_stage{i}``.
        print_latency: Forwarded to ``comb_logic_gen``.
        timescale: Directive placed before the top module and forwarded to
            ``comb_logic_gen``; omitted from the top module when falsy.
        reset_high: Accepted but not read by this function.

    Returns:
        Dict mapping each stage module name, followed by ``name``, to its source.
    """

    def _bits(qints):
        # Total packed width: sum of k + i + f over all elements.
        return sum(sum(kif) for kif in map(_minimal_kif, qints))

    stages = csol.solutions
    n_stage = len(stages)
    stage_in_bits = [_bits(stage.inp_qint) for stage in stages]
    # A stage's output width is the next stage's input width; the last one
    # is the width of the overall output.
    stage_out_bits = stage_in_bits[1:] + [_bits(csol.out_qint)]

    reg_decls = [f'reg [{w}-1:0] stage{k}_inp;' for k, w in enumerate(stage_in_bits)]
    wire_decls = [f'wire [{w}-1:0] stage{k}_out;' for k, w in enumerate(stage_out_bits)]
    instances = [f'{name}_stage{k} stage{k} (.inp(stage{k}_inp), .out(stage{k}_out));' for k in range(n_stage)]

    transfers = ['stage0_inp <= inp;']
    transfers.extend(f'stage{k}_inp <= stage{k-1}_out;' for k in range(1, n_stage))
    transfers.append(f'out <= stage{n_stage-1}_out;')

    sep0 = '\n '
    sep1 = '\n '

    module = f"""module {name} (
input clk,
input [{stage_in_bits[0]-1}:0] inp,
output reg [{stage_out_bits[-1]-1}:0] out
);

{sep0.join(reg_decls)}
{sep0.join(wire_decls)}

{sep0.join(instances)}

always @(posedge clk) begin
{sep1.join(transfers)}
end
endmodule
"""

    if timescale:
        module = f'{timescale}\n\n{module}'

    stage_names = [f'{name}_stage{k}' for k in range(n_stage)]
    ret: dict[str, str] = {
        stage_name: comb_logic_gen(stage, stage_name, print_latency=print_latency, timescale=timescale)
        for stage_name, stage in zip(stage_names, stages)
    }
    ret[name] = module
    return ret
@@ -0,0 +1,27 @@
1
# Builds lib$(VM_PREFIX)_$(STAMP).so from $(VM_PREFIX).v and
# $(VM_PREFIX)_binder.cc via Verilator.
# VM_PREFIX and STAMP are not defined in this file and must be supplied by the
# caller; CXXFLAGS2 and EXTRA_CXXFLAGS are optional extra compiler flags.
default: slow

# These goals name actions, not files; without .PHONY a stray file called
# "clean", "fast", "slow" or "default" would silently disable the goal.
.PHONY: default fast slow clean

# Third field of the last VERILATOR_ROOT line printed by `verilator -V`.
VERILATOR_ROOT = $(shell verilator -V | grep -a VERILATOR_ROOT | tail -1 | awk '{{print $$3}}')
INCLUDES = -I./obj_dir -I$(VERILATOR_ROOT)/include
WARNINGS = -Wl,--no-undefined
CFLAGS = -std=c++17 -fPIC
LINKFLAGS = $(INCLUDES) $(WARNINGS)
LIBNAME = lib$(VM_PREFIX)_$(STAMP).so
N_JOBS ?= $(shell nproc)


# Verilate the design and build the static archives in ./obj_dir.
./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a: $(VM_PREFIX).v
	verilator --cc -j $(N_JOBS) -Wall -build $(VM_PREFIX).v --prefix V$(VM_PREFIX) -CFLAGS "$(CFLAGS)"

# Link the binder against the Verilated archives into one shared object.
$(LIBNAME): ./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a $(VM_PREFIX)_binder.cc
	$(CXX) $(CFLAGS) $(LINKFLAGS) $(CXXFLAGS2) -pthread -shared -o $(LIBNAME) $(VM_PREFIX)_binder.cc ./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a $(EXTRA_CXXFLAGS)


# fast and slow differ only in the optimisation level appended to CFLAGS.
fast: CFLAGS += -O3
fast: $(LIBNAME)

slow: CFLAGS += -O
slow: $(LIBNAME)

clean:
	rm -rf obj_dir
	rm -f $(LIBNAME)
@@ -0,0 +1,75 @@
1
# Out-of-context synthesis, placement and routing flow for
# "<project_name>_wrapper"; writes checkpoints and reports under
# ./output_<project_name>.
# NOTE(review): ${PROJECT_NAME} and ${DEVICE} look like template placeholders
# filled in before this script is run -- confirm against the code that
# writes this file out.
set project_name "${PROJECT_NAME}"
set device "${DEVICE}"

set top_module "${project_name}_wrapper"
set output_dir "./output_${project_name}"

create_project $project_name "${output_dir}/$project_name" -force -part $device

set_property TARGET_LANGUAGE Verilog [current_project]
set_property DEFAULT_LIB work [current_project]

# Sources: wrapper, top module, the shared shift_adder primitive and any
# per-stage modules that exist (the glob may legitimately match nothing).
read_verilog "${project_name}_wrapper.v"
read_verilog "${project_name}.v"
read_verilog "shift_adder.v"
foreach file [glob -nocomplain "${project_name}_stage*.v"] {
    read_verilog $file
}

read_xdc "${project_name}.xdc" -mode out_of_context

set_property top $top_module [current_fileset]

file mkdir $output_dir
file mkdir "${output_dir}/reports"

# synth
synth_design -top $top_module -mode out_of_context -retiming \
    -flatten_hierarchy rebuilt -resource_sharing auto \
    -keep_equivalent_registers -shreg_min_size 8 \
    -directive AlternateRoutability

write_checkpoint -force "${output_dir}/${project_name}_post_synth.dcp"

report_timing_summary -file "${output_dir}/reports/${project_name}_post_synth_timing.rpt"
report_power -file "${output_dir}/reports/${project_name}_post_synth_power.rpt"
report_utilization -file "${output_dir}/reports/${project_name}_post_synth_util.rpt"

# set_property CARRY_REMAP 3 [get_cells -hier -filter {ref_name == CARRY8}]

# Two logic-optimisation passes with different directives.
opt_design -directive ExploreSequentialArea
opt_design -directive ExploreWithRemap

report_design_analysis -congestion -file "${output_dir}/reports/${project_name}_post_opt_congestion.rpt"

# place
place_design -directive AltSpreadLogic_high -fanout_opt
report_design_analysis -congestion -file "${output_dir}/reports/${project_name}_post_place_congestion_initial.rpt"

phys_opt_design -directive AggressiveExplore
write_checkpoint -force "${output_dir}/${project_name}_post_place.dcp"

report_design_analysis -congestion -file "${output_dir}/reports/${project_name}_post_place_congestion_final.rpt"

report_timing_summary -file "${output_dir}/reports/${project_name}_post_place_timing.rpt"
report_utilization -hierarchical -file "${output_dir}/reports/${project_name}_post_place_util.rpt"

# route
route_design -directive NoTimingRelaxation
write_checkpoint -force "${output_dir}/${project_name}_post_route.dcp"


# Final human-readable reports.
report_timing_summary -file "${output_dir}/reports/${project_name}_post_route_timing.rpt"
report_timing -sort_by group -max_paths 100 -path_type summary -file "${output_dir}/reports/${project_name}_post_route_timing_paths.rpt"
report_clock_utilization -file "${output_dir}/reports/${project_name}_post_route_clock_util.rpt"
report_utilization -file "${output_dir}/reports/${project_name}_post_route_util.rpt"
report_power -file "${output_dir}/reports/${project_name}_post_route_power.rpt"
report_drc -file "${output_dir}/reports/${project_name}_post_route_drc.rpt"

# XML variants of the utilisation and power reports.
report_utilization -format xml -hierarchical -file "${output_dir}/reports/${project_name}_post_route_util.xml"
report_power -xpe "${output_dir}/reports/${project_name}_post_route_power.xml"

# Generate Verilog netlist for simulation
# write_verilog -force "${output_dir}/${project_name}_impl_netlist.v" -mode timesim -sdf_anno true

puts "Implementation complete. Results saved in ${output_dir}"
@@ -0,0 +1,117 @@
1
+ #include "verilated.h"
2
+ #include <cassert>
3
+ #include <cstdint>
4
+ #include <vector>
5
// Packs N_in values of bw bits each, back to back starting at bit 0, into a
// vector of 32-bit words. Only the low bw bits of every value are kept.
template <size_t bw, size_t N_in> std::vector<int32_t> bitpack(const int32_t *values) {
    static_assert(bw > 0 && bw <= 32, "Bit width must be between 1 and 32");

    constexpr size_t n_words = (N_in * bw + 31) / 32;
    constexpr uint32_t field_mask = (bw == 32) ? 0xFFFFFFFF : ((1U << bw) - 1);

    std::vector<int32_t> words(n_words, 0);

    for (size_t k = 0, cursor = 0; k < N_in; ++k, cursor += bw) {
        const uint32_t field = values[k] & field_mask;
        const size_t word = cursor / 32;
        const size_t shift = cursor % 32;

        // Part of the field that fits into the current word.
        words[word] |= (field << shift);

        // Remainder spills into the next word when the field straddles a boundary.
        const bool straddles = shift + bw > 32;
        if (straddles && word + 1 < words.size()) {
            words[word + 1] |= (field >> (32 - shift));
        }
    }

    return words;
}
35
+
36
// Inverse of bitpack: extracts N_out fields of bw bits each from 32-bit words.
// Fields are returned as stored, without sign extension.
template <size_t bw, size_t N_out> std::vector<int32_t> bitunpack(const std::vector<int32_t> &packed) {
    static_assert(bw > 0 && bw <= 32, "Bit width must be between 1 and 32");

    constexpr size_t expected_words = (N_out * bw + 31) / 32;
    assert(packed.size() == expected_words);

    std::vector<int32_t> fields(N_out, 0);

    for (size_t k = 0; k < N_out; ++k) {
        const size_t first_bit = k * bw;
        const size_t word = first_bit / 32;
        const size_t shift = first_bit % 32;

        // Bits of this field that live in the current word.
        const size_t room = 32 - shift;
        const size_t n_low = (bw < room) ? bw : room;
        const uint32_t low_mask = (n_low == 32) ? 0xFFFFFFFF : ((1U << n_low) - 1);
        int32_t field = (packed[word] >> shift) & low_mask;

        // Remaining bits, if any, come from the bottom of the next word.
        if (shift + bw > 32) {
            assert(word + 1 < packed.size());
            const size_t n_high = shift + bw - 32;
            const uint32_t high_mask = ((1U << n_high) - 1);
            const uint32_t high_bits = packed[word + 1] & high_mask;
            field |= (high_bits << n_low);
        }

        fields[k] = field;
    }

    return fields;
}
69
+
70
// Writes up to two packed 32-bit words into an integral input port.
// input[0] supplies bits 31..0; when bits_in > 32, input[1] supplies bits 63..32.
template <size_t bits_in, typename inp_buf_t>
std::enable_if_t<std::is_integral_v<inp_buf_t>, void> _write_input(inp_buf_t &inp_buf, const std::vector<int32_t> &input) {
    assert(input.size() == (bits_in + 31) / 32);
    inp_buf = static_cast<uint32_t>(input[0]);
    // if constexpr keeps the 32-bit shift from being instantiated for narrow
    // port types; going through uint32_t/uint64_t avoids left-shifting a
    // negative signed value, which is undefined before C++20.
    if constexpr (bits_in > 32) {
        inp_buf |= static_cast<uint64_t>(static_cast<uint32_t>(input[1])) << 32;
    }
}
78
+
79
+ template <size_t bits_in, size_t N_in> void _write_input(VlWide<N_in> &inp_buf, const std::vector<int32_t> &input) {
80
+ assert(input.size() == (bits_in + 31) / 32);
81
+ for (size_t i = 0; i < input.size(); ++i) {
82
+ inp_buf[i] = input[i];
83
+ }
84
+ }
85
+
86
// Splits an integral output port into one or two packed 32-bit words.
// output[0] holds bits 31..0; when bits_out > 32, output[1] holds bits 63..32.
template <size_t bits_out, typename out_buf_t>
std::enable_if_t<std::is_integral_v<out_buf_t>, std::vector<int32_t>> _read_output(out_buf_t &out_buf) {
    std::vector<int32_t> output((bits_out + 31) / 32);
    output[0] = out_buf & 0xFFFFFFFF;
    // if constexpr discards the branch for ports of 32 bits or fewer, so the
    // shift by 32 is never instantiated on a type that is too narrow for it.
    if constexpr (bits_out > 32) {
        output[1] = (out_buf >> 32) & 0xFFFFFFFF;
    }
    return output;
}
95
+
96
+ template <size_t bits_out, size_t N_out> std::vector<int32_t> _read_output(VlWide<N_out> out_buf) {
97
+ std::vector<int32_t> output((bits_out + 31) / 32);
98
+ for (size_t i = 0; i < output.size(); ++i) {
99
+ output[i] = out_buf[i] & 0xFFFFFFFF;
100
+ }
101
+ return output;
102
+ }
103
+
104
+ template <size_t N, size_t max_bw, typename inp_buf_t> void write_input(inp_buf_t &inp_buf, const int32_t *c_inp) {
105
+ constexpr size_t bits_in = N * max_bw;
106
+ std::vector<int32_t> input = bitpack<max_bw, N>(c_inp);
107
+ _write_input<bits_in>(inp_buf, input);
108
+ }
109
+
110
+ template <size_t N, size_t max_bw, typename out_buf_t> void read_output(out_buf_t out_buf, int32_t *c_out) {
111
+ constexpr size_t bits_out = N * max_bw;
112
+ std::vector<int32_t> packed = _read_output<bits_out>(out_buf);
113
+ std::vector<int32_t> unpacked = bitunpack<max_bw, N>(packed);
114
+ for (size_t i = 0; i < N; ++i) {
115
+ c_out[i] = unpacked[i];
116
+ }
117
+ }
@@ -0,0 +1,56 @@
1
+ `timescale 1ns / 1ps
2
+
3
+
4
// Adds or subtracts two operands after aligning in1 against in0 by SHIFT1
// bit positions, and outputs the low BW_OUT bits of the result.
//   SIGNED0 / SIGNED1 : 1 = sign-extend that operand, otherwise zero-extend.
//   SHIFT1            : > 0 pads in1 on the right (shifts it up),
//                       < 0 pads in0 on the right instead.
//   IS_SUB            : 1 = in0 - in1, otherwise in0 + in1.
module shift_adder #(
    parameter BW_INPUT0 = 32,
    parameter BW_INPUT1 = 32,
    parameter SIGNED0 = 0,
    parameter SIGNED1 = 0,
    parameter BW_OUT = 32,
    parameter SHIFT1 = 0,
    parameter IS_SUB = 0
) (
    input [BW_INPUT0-1:0] in0,
    input [BW_INPUT1-1:0] in1,
    output [BW_OUT-1:0] out
);

    // Width each operand needs once the shift is applied.
    localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
    localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
    // One extra bit for subtraction and one more when the signedness differs.
    localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB+1 : IS_SUB+0;
    // Internal width: the wider operand plus the extra padding plus one bit.
    localparam BW_ADD = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD + 1 : IN1_NEED_BITS + EXTRA_PAD + 1;
    // Left pads extend each operand to BW_ADD; right pads implement the shift.
    localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_ADD - BW_INPUT0 + SHIFT1 : BW_ADD - BW_INPUT0;
    localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
    localparam IN1_PAD_LEFT = (SHIFT1 > 0) ? BW_ADD - BW_INPUT1 - SHIFT1 : BW_ADD - BW_INPUT1;
    localparam IN1_PAD_RIGHT = (SHIFT1 > 0) ? SHIFT1 : 0;

    wire [BW_ADD-1:0] in0_ext;
    wire [BW_ADD-1:0] in1_ext;

    // Only the low BW_OUT bits of accum are read, hence the lint waiver.
    // verilator lint_off UNUSEDSIGNAL
    wire [BW_ADD-1:0] accum;
    // verilator lint_on UNUSEDSIGNAL

    // Extend both operands to BW_ADD bits: sign bit or zeros on the left,
    // zeros on the right.
    generate
        if (SIGNED0 == 1) begin : in0_is_signed
            assign in0_ext = {{IN0_PAD_LEFT{in0[BW_INPUT0-1]}}, in0, {IN0_PAD_RIGHT{1'b0}}};
        end else begin : in0_is_unsigned
            assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
        end
        if (SIGNED1 == 1) begin : in1_is_signed
            assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
        end else begin : in1_is_unsigned
            assign in1_ext = {{IN1_PAD_LEFT{1'b0}}, in1, {IN1_PAD_RIGHT{1'b0}}};
        end
    endgenerate

    generate
        if (IS_SUB == 1) begin : is_sub
            assign accum = in0_ext - in1_ext;
        end else begin : is_add
            assign accum = in0_ext + in1_ext;
        end
    endgenerate
    // NOTE(review): this slice assumes BW_OUT <= BW_ADD -- confirm callers
    // never request a wider output.
    assign out = accum[BW_OUT-1:0];

endmodule
@@ -0,0 +1,29 @@
1
# Timing constraints for a design with a clock port "clk" and buses
# "inp" / "out".
# NOTE(review): the ${...} tokens look like template placeholders filled in
# before this file is read -- confirm against the code that writes it out.
set clock_period ${CLOCK_PERIOD}

# Clock uncertainty and I/O delays, each given as a fraction of the clock
# period (they are multiplied by the period below without further scaling)
set uncertainty_setup_r ${UNCERTAINITY_SETUP}
set uncertainty_hold_r ${UNCERTAINITY_HOLD}
set delay_max_r ${DELAY_MAX}
set delay_min_r ${DELAY_MIN}

# Calculate actual uncertainty and delay values in clock-period units
set uncertainty_setup [expr {$clock_period * $uncertainty_setup_r}]
set uncertainty_hold [expr {$clock_period * $uncertainty_hold_r}]
set delay_max [expr {$clock_period * $delay_max_r}]
set delay_min [expr {$clock_period * $delay_min_r}]

# Create clock with variable period
create_clock -period $clock_period -name sys_clk [get_ports {clk}]

# Input/Output constraints (the same max/min delays are applied to both buses)
set_input_delay -clock sys_clk -max $delay_max [get_ports {inp[*]}]
set_input_delay -clock sys_clk -min $delay_min [get_ports {inp[*]}]

set_output_delay -clock sys_clk -max $delay_max [get_ports {out[*]}]
set_output_delay -clock sys_clk -min $delay_min [get_ports {out[*]}]

# Apply calculated uncertainty values
set_clock_uncertainty -setup $uncertainty_setup [get_clocks sys_clk]
set_clock_uncertainty -hold $uncertainty_hold [get_clocks sys_clk]

# NOTE(review): presumably names the clock buffer assumed to drive clk when
# the design is implemented out of context -- confirm against the Vivado docs.
set_property HD.CLK_SRC BUFG_X0Y0 [get_ports clk]