da4ml 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of da4ml might be problematic. Click here for more details.

Files changed (59)
  1. da4ml/_version.py +2 -2
  2. da4ml/cmvm/api.py +2 -6
  3. da4ml/cmvm/core/__init__.py +0 -1
  4. da4ml/cmvm/types.py +99 -19
  5. da4ml/codegen/__init__.py +5 -4
  6. da4ml/codegen/cpp/__init__.py +2 -1
  7. da4ml/codegen/cpp/cpp_codegen.py +58 -25
  8. da4ml/codegen/cpp/hls_model.py +252 -0
  9. da4ml/codegen/cpp/source/ap_types/ap_binary.h +78 -0
  10. da4ml/codegen/cpp/source/ap_types/ap_common.h +376 -0
  11. da4ml/codegen/cpp/source/ap_types/ap_decl.h +212 -0
  12. da4ml/codegen/cpp/source/ap_types/ap_fixed.h +360 -0
  13. da4ml/codegen/cpp/source/ap_types/ap_fixed_base.h +2354 -0
  14. da4ml/codegen/cpp/source/ap_types/ap_fixed_ref.h +718 -0
  15. da4ml/codegen/cpp/source/ap_types/ap_fixed_special.h +230 -0
  16. da4ml/codegen/cpp/source/ap_types/ap_int.h +330 -0
  17. da4ml/codegen/cpp/source/ap_types/ap_int_base.h +1885 -0
  18. da4ml/codegen/cpp/source/ap_types/ap_int_ref.h +1346 -0
  19. da4ml/codegen/cpp/source/ap_types/ap_int_special.h +223 -0
  20. da4ml/codegen/cpp/source/ap_types/ap_shift_reg.h +138 -0
  21. da4ml/codegen/cpp/source/ap_types/etc/ap_private.h +7199 -0
  22. da4ml/codegen/cpp/source/ap_types/hls_math.h +27 -0
  23. da4ml/codegen/cpp/source/ap_types/hls_stream.h +263 -0
  24. da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h +80 -0
  25. da4ml/codegen/cpp/source/binder_util.hh +56 -0
  26. da4ml/codegen/cpp/source/build_binder.mk +24 -0
  27. da4ml/codegen/cpp/source/{vitis.h → vitis_bitshift.hh} +1 -1
  28. da4ml/codegen/verilog/__init__.py +2 -3
  29. da4ml/codegen/verilog/comb.py +65 -24
  30. da4ml/codegen/verilog/io_wrapper.py +36 -141
  31. da4ml/codegen/verilog/pipeline.py +21 -3
  32. da4ml/codegen/verilog/source/binder_util.hh +72 -0
  33. da4ml/codegen/verilog/source/build_prj.tcl +0 -1
  34. da4ml/codegen/verilog/source/mux.v +58 -0
  35. da4ml/codegen/verilog/source/negative.v +28 -0
  36. da4ml/codegen/verilog/source/shift_adder.v +4 -1
  37. da4ml/codegen/verilog/source/template.xdc +3 -0
  38. da4ml/codegen/verilog/verilog_model.py +42 -15
  39. da4ml/converter/__init__.py +0 -0
  40. da4ml/converter/hgq2/parser.py +105 -0
  41. da4ml/converter/hgq2/replica.py +383 -0
  42. da4ml/trace/__init__.py +2 -2
  43. da4ml/trace/fixed_variable.py +177 -18
  44. da4ml/trace/fixed_variable_array.py +124 -9
  45. da4ml/trace/ops/__init__.py +22 -6
  46. da4ml/trace/ops/conv_utils.py +146 -14
  47. da4ml/trace/ops/einsum_utils.py +9 -6
  48. da4ml/trace/ops/reduce_utils.py +103 -0
  49. da4ml/trace/pipeline.py +36 -34
  50. da4ml/trace/tracer.py +37 -5
  51. da4ml-0.3.0.dist-info/METADATA +107 -0
  52. da4ml-0.3.0.dist-info/RECORD +64 -0
  53. da4ml/codegen/cpp/source/vitis_bridge.h +0 -17
  54. da4ml-0.2.0.dist-info/METADATA +0 -65
  55. da4ml-0.2.0.dist-info/RECORD +0 -39
  56. da4ml/codegen/verilog/source/{ioutils.hh → ioutil.hh} +0 -0
  57. {da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/WHEEL +0 -0
  58. {da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/licenses/LICENSE +0 -0
  59. {da4ml-0.2.0.dist-info → da4ml-0.3.0.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,7 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
7
7
  N = len(qints)
8
8
  ks, _is, fs = zip(*map(_minimal_kif, qints))
9
9
  Is = [_i + _k for _i, _k in zip(_is, ks)]
10
- max_I, max_f = max(Is), max(fs)
10
+ max_I, max_f = max(_is) + max(ks), max(fs)
11
11
  max_bw = max_I + max_f
12
12
  width_regular, width_packed = max_bw * N, sum(Is) + sum(fs)
13
13
 
@@ -32,11 +32,16 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
32
32
  copy_from = hetero[i][0] if ks[i] else -1
33
33
  pads.append((base + max_bw - 1, base + max_bw - bias_high, copy_from))
34
34
 
35
+ mask = list(high < low for high, low in hetero)
36
+ regular = [r for r, m in zip(regular, mask) if not m]
37
+ hetero = [h for h, m in zip(hetero, mask) if not m]
38
+
35
39
  if not merge:
36
40
  return regular, hetero, pads, (width_regular, width_packed)
37
41
 
38
42
  # Merging consecutive intervals when possible
39
- for i in range(N - 2, -1, -1):
43
+ NN = len(regular) - 2
44
+ for i in range(NN, -1, -1):
40
45
  this_high = regular[i][0]
41
46
  next_low = regular[i + 1][1]
42
47
  if next_low - this_high != 1:
@@ -65,6 +70,8 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe
65
70
  _out_assignment: list[tuple[int, str]] = []
66
71
 
67
72
  for i, ((ih, jh), (ir, jr)) in enumerate(zip(het_out, reg_out)):
73
+ if ih == jh - 1:
74
+ continue
68
75
  _out_assignment.append((ih, f'assign out[{ir}:{jr}] = packed_out[{ih}:{jh}];'))
69
76
 
70
77
  for i, (i, j, copy_from) in enumerate(pad_out):
@@ -86,12 +93,12 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe
86
93
 
87
94
  module {module_name}_wrapper ({clk_and_rst_inp}
88
95
  // verilator lint_off UNUSEDSIGNAL
89
- input [{w_reg_in-1}:0] inp,
96
+ input [{w_reg_in - 1}:0] inp,
90
97
  // verilator lint_on UNUSEDSIGNAL
91
- output [{w_reg_out-1}:0] out
98
+ output [{w_reg_out - 1}:0] out
92
99
  );
93
- wire [{w_het_in-1}:0] packed_inp;
94
- wire [{w_het_out-1}:0] packed_out;
100
+ wire [{w_het_in - 1}:0] packed_inp;
101
+ wire [{w_het_out - 1}:0] packed_out;
95
102
 
96
103
  {inp_assignment_str}
97
104
 
@@ -106,150 +113,38 @@ endmodule
106
113
  """
107
114
 
108
115
 
109
- def comb_binder_gen(sol: Solution, module_name: str):
110
- k_in, i_in, f_in = zip(*map(_minimal_kif, sol.inp_qint))
111
- k_out, i_out, f_out = zip(*map(_minimal_kif, sol.out_qint))
112
- max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
113
- max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
114
-
115
- n_in, n_out = sol.shape
116
- return f"""#include "V{module_name}.h"
117
- #include "ioutils.hh"
118
- #include <verilated.h>
119
-
120
- #ifdef _OPENMP
121
- #include <omp.h>
122
- constexpr bool _openmp = true;
123
- #else
124
- constexpr bool _openmp = false;
125
- #endif
126
-
127
- constexpr size_t N_inp = {n_in};
128
- constexpr size_t N_out = {n_out};
129
- constexpr size_t max_inp_bw = {max_inp_bw};
130
- constexpr size_t max_out_bw = {max_out_bw};
131
- typedef V{module_name} dut_t;
132
-
133
- extern "C" {{
134
-
135
- bool openmp_enabled() {{
136
- return _openmp;
137
- }}
138
-
139
- void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
140
- dut_t *dut = new dut_t;
141
-
142
- for (size_t i = 0; i < n_samples; ++i) {{
143
- write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[i * N_inp]);
144
- dut->eval();
145
- read_output<N_out, max_out_bw>(dut->out, &c_out[i * N_out]);
146
- }}
147
-
148
- dut->final();
149
- delete dut;
150
- }}
151
-
152
- void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
153
- size_t n_max_threads = omp_get_max_threads();
154
- size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
155
- size_t n_thread = n_samples / n_samples_per_thread;
156
- n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
157
-
158
- #ifdef _OPENMP
159
- #pragma omp parallel for num_threads(n_thread) schedule(static)
160
- for (size_t i = 0; i < n_thread; ++i) {{
161
- size_t start = i * n_samples_per_thread;
162
- size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
163
- size_t n_samples_this_thread = end - start;
164
-
165
- _inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
166
- }}
167
- #else
168
- _inference(c_inp, c_out, n_samples);
169
- #endif
170
- }}
171
- }}"""
172
-
173
-
174
- def pipeline_binder_gen(csol: CascadedSolution, module_name: str, II: int = 1):
116
+ def binder_gen(csol: CascadedSolution | Solution, module_name: str, II: int = 1, latency_multiplier: int = 1):
175
117
  k_in, i_in, f_in = zip(*map(_minimal_kif, csol.inp_qint))
176
118
  k_out, i_out, f_out = zip(*map(_minimal_kif, csol.out_qint))
177
- max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
178
- max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
179
-
180
- n_stage = len(csol.solutions)
119
+ max_inp_bw = max(k_in) + max(i_in) + max(f_in)
120
+ max_out_bw = max(k_out) + max(i_out) + max(f_out)
121
+ if isinstance(csol, Solution):
122
+ II = latency = 0
123
+ else:
124
+ latency = len(csol.solutions) * latency_multiplier
181
125
 
182
126
  n_in, n_out = csol.shape
183
- return f"""#include "V{module_name}.h"
184
- #include "ioutils.hh"
185
- #include <verilated.h>
186
-
187
- #ifdef _OPENMP
188
- #include <omp.h>
189
- constexpr bool _openmp = true;
190
- #else
191
- constexpr bool _openmp = false;
192
- #endif
193
-
194
- constexpr size_t N_inp = {n_in};
195
- constexpr size_t N_out = {n_out};
196
- constexpr size_t max_inp_bw = {max_inp_bw};
197
- constexpr size_t max_out_bw = {max_out_bw};
198
- constexpr size_t II = {II};
199
- constexpr size_t latency = {n_stage};
200
- typedef V{module_name} dut_t;
127
+ return f"""#include <cstddef>
128
+ #include "binder_util.hh"
129
+ #include "V{module_name}.h"
130
+
131
+ struct {module_name}_config {{
132
+ static const size_t N_inp = {n_in};
133
+ static const size_t N_out = {n_out};
134
+ static const size_t max_inp_bw = {max_inp_bw};
135
+ static const size_t max_out_bw = {max_out_bw};
136
+ static const size_t II = {II};
137
+ static const size_t latency = {latency};
138
+ typedef V{module_name} dut_t;
139
+ }};
201
140
 
202
141
  extern "C" {{
203
-
204
142
  bool openmp_enabled() {{
205
143
  return _openmp;
206
144
  }}
207
145
 
208
- void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
209
- dut_t *dut = new dut_t;
210
-
211
- size_t clk_req = n_samples * II + latency + 1;
212
-
213
- for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {{
214
- size_t t_out = t_inp - latency - 1;
215
-
216
- if (t_inp < n_samples * II && t_inp % II == 0) {{
217
- write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[t_inp / II * N_inp]);
218
- }}
219
-
220
- dut->clk = 0;
221
- dut->eval();
222
-
223
- if (t_inp > latency && t_out % II == 0) {{
224
- read_output<N_out, max_out_bw>(dut->out, &c_out[t_out / II * N_out]);
225
- }}
226
-
227
- dut->clk = 1;
228
- dut->eval();
229
- }}
230
-
231
- dut->final();
232
- delete dut;
233
- }}
234
-
235
146
  void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
236
- #ifdef _OPENMP
237
- size_t n_max_threads = omp_get_max_threads();
238
- size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
239
- size_t n_thread = n_samples / n_samples_per_thread;
240
- n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
241
-
242
- #pragma omp parallel for num_threads(n_thread) schedule(static)
243
- for (size_t i = 0; i < n_thread; ++i) {{
244
- size_t start = i * n_samples_per_thread;
245
- size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
246
- size_t n_samples_this_thread = end - start;
247
-
248
- _inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
249
- }}
250
- #else
251
- _inference(c_inp, c_out, n_samples);
252
- #endif
147
+ batch_inference<{module_name}_config>(c_inp, c_out, n_samples);
253
148
  }}
254
-
255
- }}"""
149
+ }}
150
+ """
@@ -3,19 +3,37 @@ from .comb import comb_logic_gen
3
3
 
4
4
 
5
5
  def pipeline_logic_gen(
6
- csol: CascadedSolution, name: str, print_latency=False, timescale: str | None = '`timescale 1 ns / 1 ps', reset_high=True
6
+ csol: CascadedSolution,
7
+ name: str,
8
+ print_latency=False,
9
+ timescale: str | None = '`timescale 1 ns / 1 ps',
10
+ register_layers: int = 1,
7
11
  ):
8
12
  N = len(csol.solutions)
9
13
  inp_bits = [sum(map(sum, map(_minimal_kif, sol.inp_qint))) for sol in csol.solutions]
10
14
  out_bits = inp_bits[1:] + [sum(map(sum, map(_minimal_kif, csol.out_qint)))]
11
15
 
12
16
  registers = [f'reg [{width}-1:0] stage{i}_inp;' for i, width in enumerate(inp_bits)]
17
+ for i in range(0, register_layers - 1):
18
+ registers += [f'reg [{width}-1:0] stage{j}_inp_copy{i};' for j, width in enumerate(inp_bits)]
13
19
  wires = [f'wire [{width}-1:0] stage{i}_out;' for i, width in enumerate(out_bits)]
14
20
 
15
21
  comb_logic = [f'{name}_stage{i} stage{i} (.inp(stage{i}_inp), .out(stage{i}_out));' for i in range(N)]
16
22
 
17
- serial_logic = ['stage0_inp <= inp;']
18
- serial_logic += [f'stage{i}_inp <= stage{i-1}_out;' for i in range(1, N)]
23
+ if register_layers == 1:
24
+ serial_logic = ['stage0_inp <= inp;']
25
+ serial_logic += [f'stage{i}_inp <= stage{i-1}_out;' for i in range(1, N)]
26
+ else:
27
+ serial_logic = ['stage0_inp_copy0 <= inp;']
28
+ for j in range(1, register_layers - 1):
29
+ serial_logic.append(f'stage0_inp_copy{j} <= stage0_inp_copy{j-1};')
30
+ serial_logic.append(f'stage0_inp <= stage0_inp_copy{register_layers - 2};')
31
+ for i in range(1, N):
32
+ serial_logic.append(f'stage{i}_inp_copy0 <= stage{i-1}_out;')
33
+ for j in range(1, register_layers - 1):
34
+ serial_logic.append(f'stage{i}_inp_copy{j} <= stage{i}_inp_copy{j-1};')
35
+ serial_logic.append(f'stage{i}_inp <= stage{i}_inp_copy{register_layers - 2};')
36
+
19
37
  serial_logic += [f'out <= stage{N-1}_out;']
20
38
 
21
39
  sep0 = '\n '
@@ -0,0 +1,72 @@
1
+ #include "ioutil.hh"
2
+ #include <verilated.h>
3
+
4
+ #ifdef _OPENMP
5
+ #include <omp.h>
6
+ constexpr bool _openmp = true;
7
+ #else
8
+ constexpr bool _openmp = false;
9
+ #endif
10
+
11
+ template <typename CONFIG_T>
12
+ std::enable_if_t<CONFIG_T::II != 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
13
+ typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
14
+
15
+ size_t clk_req = n_samples * CONFIG_T::II + CONFIG_T::latency + 1;
16
+
17
+ for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {
18
+ size_t t_out = t_inp - CONFIG_T::latency - 1;
19
+
20
+ if (t_inp < n_samples * CONFIG_T::II && t_inp % CONFIG_T::II == 0) {
21
+ write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[t_inp / CONFIG_T::II * CONFIG_T::N_inp]);
22
+ }
23
+
24
+ dut->clk = 0;
25
+ dut->eval();
26
+
27
+ if (t_inp > CONFIG_T::latency && t_out % CONFIG_T::II == 0) {
28
+ read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[t_out / CONFIG_T::II * CONFIG_T::N_out]);
29
+ }
30
+
31
+ dut->clk = 1;
32
+ dut->eval();
33
+ }
34
+
35
+ dut->final();
36
+ delete dut;
37
+ }
38
+
39
+ template <typename CONFIG_T>
40
+ std::enable_if_t<CONFIG_T::II == 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
41
+ typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
42
+
43
+ for (size_t i = 0; i < n_samples; ++i) {
44
+ write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[i * CONFIG_T::N_inp]);
45
+ dut->eval();
46
+ read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[i * CONFIG_T::N_out]);
47
+ }
48
+
49
+ dut->final();
50
+ delete dut;
51
+ }
52
+
53
+ template <typename CONFIG_T> void batch_inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
54
+ #ifdef _OPENMP
55
+ size_t n_max_threads = omp_get_max_threads();
56
+ size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
57
+ size_t n_thread = n_samples / n_samples_per_thread;
58
+ n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
59
+
60
+ #pragma omp parallel for num_threads(n_thread) schedule(static)
61
+ for (size_t i = 0; i < n_thread; ++i) {
62
+ size_t start = i * n_samples_per_thread;
63
+ size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
64
+ size_t n_samples_this_thread = end - start;
65
+ size_t offset_in = start * CONFIG_T::N_inp;
66
+ size_t offset_out = start * CONFIG_T::N_out;
67
+ _inference<CONFIG_T>(&c_inp[offset_in], &c_out[offset_out], n_samples_this_thread);
68
+ }
69
+ #else
70
+ _inference<CONFIG_T>(c_inp, c_out, n_samples);
71
+ #endif
72
+ }
@@ -26,7 +26,6 @@ file mkdir "${output_dir}/reports"
26
26
  # synth
27
27
  synth_design -top $top_module -mode out_of_context -retiming \
28
28
  -flatten_hierarchy rebuilt -resource_sharing auto \
29
- -keep_equivalent_registers -shreg_min_size 8 \
30
29
  -directive AlternateRoutability
31
30
 
32
31
  write_checkpoint -force "${output_dir}/${project_name}_post_synth.dcp"
@@ -0,0 +1,58 @@
1
+ `timescale 1ns / 1ps
2
+
3
+
4
+ module mux #(
5
+ parameter BW_INPUT0 = 32,
6
+ parameter BW_INPUT1 = 32,
7
+ parameter SIGNED0 = 0,
8
+ parameter SIGNED1 = 0,
9
+ parameter BW_OUT = 32,
10
+ parameter SHIFT1 = 0,
11
+ parameter INVERT1 = 0
12
+ ) (
13
+ input key,
14
+ input [BW_INPUT0-1:0] in0,
15
+ input [BW_INPUT1-1:0] in1,
16
+ output [BW_OUT-1:0] out
17
+ );
18
+
19
+ localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
20
+ localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
21
+ localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? INVERT1 + 1 : INVERT1 + 0;
22
+ localparam BW_BUF = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD : IN1_NEED_BITS + EXTRA_PAD;
23
+ localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_BUF - BW_INPUT0 + SHIFT1 : BW_BUF - BW_INPUT0;
24
+ localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
25
+ localparam IN1_PAD_LEFT = (SHIFT1 > 0) ? BW_BUF - BW_INPUT1 - SHIFT1 : BW_BUF - BW_INPUT1;
26
+ localparam IN1_PAD_RIGHT = (SHIFT1 > 0) ? SHIFT1 : 0;
27
+
28
+
29
+ // verilator lint_off UNUSEDSIGNAL
30
+ wire [BW_BUF-1:0] in0_ext;
31
+ wire [BW_BUF-1:0] in1_ext;
32
+ // verilator lint_on UNUSEDSIGNAL
33
+
34
+ generate
35
+ if (SIGNED0 == 1) begin : in0_is_signed
36
+ assign in0_ext = {{IN0_PAD_LEFT{in0[BW_INPUT0-1]}}, in0, {IN0_PAD_RIGHT{1'b0}}};
37
+ end else begin : in0_is_unsigned
38
+ assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
39
+ end
40
+ endgenerate
41
+
42
+ generate
43
+ if (SIGNED1 == 1) begin : in1_is_signed
44
+ assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
45
+ end else begin : in1_is_unsigned
46
+ assign in1_ext = {{IN1_PAD_LEFT{1'b0}}, in1, {IN1_PAD_RIGHT{1'b0}}};
47
+ end
48
+ endgenerate
49
+
50
+ generate
51
+ if (INVERT1 == 1) begin : is_invert
52
+ assign out = (key) ? in0_ext[BW_OUT-1:0] : -in1_ext[BW_OUT-1:0];
53
+ end else begin : is_not_invert
54
+ assign out = (key) ? in0_ext[BW_OUT-1:0] : in1_ext[BW_OUT-1:0];
55
+ end
56
+ endgenerate
57
+
58
+ endmodule
@@ -0,0 +1,28 @@
1
+ `timescale 1ns / 1ps
2
+
3
+
4
+ module negative #(
5
+ parameter BW_IN = 32,
6
+ parameter BW_OUT = 32,
7
+ parameter IN_SIGNED = 0
8
+ ) (
9
+ // verilator lint_off UNUSEDSIGNAL
10
+ input [ BW_IN-1:0] in,
11
+ // verilator lint_off UNUSEDSIGNAL
12
+ output [BW_OUT-1:0] out
13
+ );
14
+ generate
15
+ if (BW_IN < BW_OUT) begin : in_is_smaller
16
+ wire [BW_OUT-1:0] in_ext;
17
+ if (IN_SIGNED == 1) begin : is_signed
18
+ assign in_ext = {{BW_OUT - BW_IN{in[BW_IN-1]}}, in};
19
+ end else begin : is_unsigned
20
+ assign in_ext = {{BW_OUT - BW_IN{1'b0}}, in};
21
+ end
22
+ assign out = -in_ext;
23
+ end else begin : in_is_bigger
24
+ assign out = -in[BW_OUT-1:0];
25
+ end
26
+ endgenerate
27
+
28
+ endmodule
@@ -17,7 +17,7 @@ module shift_adder #(
17
17
 
18
18
  localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
19
19
  localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
20
- localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB+1 : IS_SUB+0;
20
+ localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB + 1 : IS_SUB + 0;
21
21
  localparam BW_ADD = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD + 1 : IN1_NEED_BITS + EXTRA_PAD + 1;
22
22
  localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_ADD - BW_INPUT0 + SHIFT1 : BW_ADD - BW_INPUT0;
23
23
  localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
@@ -37,6 +37,9 @@ module shift_adder #(
37
37
  end else begin : in0_is_unsigned
38
38
  assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
39
39
  end
40
+ endgenerate
41
+
42
+ generate
40
43
  if (SIGNED1 == 1) begin : in1_is_signed
41
44
  assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
42
45
  end else begin : in1_is_unsigned
@@ -27,3 +27,6 @@ set_clock_uncertainty -setup $uncertainty_setup [get_clocks sys_clk]
27
27
  set_clock_uncertainty -hold $uncertainty_hold [get_clocks sys_clk]
28
28
 
29
29
  set_property HD.CLK_SRC BUFG_X0Y0 [get_ports clk]
30
+
31
+ set_property retiming_forward 1 [get_cells {stage[*]_inp}]
32
+ set_property retiming_backward 1 [get_cells {stage[*]_inp}]
@@ -13,7 +13,7 @@ from numpy.typing import NDArray
13
13
  from ... import codegen
14
14
  from ...cmvm.types import CascadedSolution, Solution, _minimal_kif
15
15
  from ...trace.pipeline import to_pipeline
16
- from . import comb_binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_binder_gen, pipeline_logic_gen
16
+ from . import binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_logic_gen
17
17
 
18
18
 
19
19
  def get_io_kifs(sol: Solution | CascadedSolution):
@@ -34,6 +34,7 @@ class VerilogModel:
34
34
  clock_period: int = 5,
35
35
  clock_uncertainty: float = 0.1,
36
36
  io_delay_minmax: tuple[float, float] = (0.2, 0.4),
37
+ register_layers: int = 1,
37
38
  ):
38
39
  self._solution = solution
39
40
  self._path = Path(path)
@@ -45,6 +46,7 @@ class VerilogModel:
45
46
  self._clock_period = clock_period
46
47
  self._clock_uncertainty = clock_uncertainty
47
48
  self._io_delay_minmax = io_delay_minmax
49
+ self._register_layers = register_layers
48
50
 
49
51
  self._pipe = solution if isinstance(solution, CascadedSolution) else None
50
52
  if latency_cutoff > 0 and self._pipe is None:
@@ -57,12 +59,13 @@ class VerilogModel:
57
59
  self._latency_cutoff = latency_cutoff
58
60
 
59
61
  self._lib = None
62
+ self._uuid = None
60
63
 
61
64
  def write(self):
62
65
  self._path.mkdir(parents=True, exist_ok=True)
63
66
  if self._pipe is not None: # Pipeline
64
67
  # Main logic
65
- codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency)
68
+ codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency, register_layers=self._register_layers)
66
69
  for k, v in codes.items():
67
70
  with open(self._path / f'{k}.v', 'w') as f:
68
71
  f.write(v)
@@ -86,8 +89,8 @@ class VerilogModel:
86
89
  with open(self._path / f'{self._prj_name}.xdc', 'w') as f:
87
90
  f.write(xdc)
88
91
 
89
- # C++ binder w/
90
- binder = pipeline_binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1)
92
+ # C++ binder w/ verilog wrapper for uniform bw
93
+ binder = binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)
91
94
 
92
95
  # Verilog IO wrapper (non-uniform bw to uniform one, clk passthrough)
93
96
  io_wrapper = generate_io_wrapper(self._pipe, self._prj_name, True)
@@ -103,7 +106,7 @@ class VerilogModel:
103
106
 
104
107
  # Verilog IO wrapper (non-uniform bw to uniform one, no clk)
105
108
  io_wrapper = generate_io_wrapper(self._solution, self._prj_name, False)
106
- binder = comb_binder_gen(self._solution, f'{self._prj_name}_wrapper')
109
+ binder = binder_gen(self._solution, f'{self._prj_name}_wrapper')
107
110
 
108
111
  with open(self._path / f'{self._prj_name}_wrapper.v', 'w') as f:
109
112
  f.write(io_wrapper)
@@ -112,13 +115,16 @@ class VerilogModel:
112
115
 
113
116
  # Common resource copy
114
117
  shutil.copy(self.__src_root / 'verilog/source/shift_adder.v', self._path)
118
+ shutil.copy(self.__src_root / 'verilog/source/mux.v', self._path)
119
+ shutil.copy(self.__src_root / 'verilog/source/negative.v', self._path)
115
120
  shutil.copy(self.__src_root / 'verilog/source/build_binder.mk', self._path)
116
- shutil.copy(self.__src_root / 'verilog/source/ioutils.hh', self._path)
121
+ shutil.copy(self.__src_root / 'verilog/source/ioutil.hh', self._path)
122
+ shutil.copy(self.__src_root / 'verilog/source/binder_util.hh', self._path)
117
123
  self._solution.save(self._path / 'model.json')
118
124
  with open(self._path / 'misc.json', 'w') as f:
119
125
  f.write(f'{{"cost": {self._solution.cost}}}')
120
126
 
121
- def _compile(self, verbose=False, openmp=True, o3: bool = False, clean=True):
127
+ def _compile(self, verbose=False, openmp=True, nproc=None, o3: bool = False, clean=True):
122
128
  """Same as compile, but will not write to the library
123
129
 
124
130
  Parameters
@@ -127,6 +133,9 @@ class VerilogModel:
127
133
  Verbose output, by default False
128
134
  openmp : bool, optional
129
135
  Enable openmp, by default True
136
+ nproc : int | None, optional
137
+ Number of processes to use for compilation, by default None
138
+ If None, will use the number of CPU cores, but not more than 32.
130
139
  o3 : bool | None, optional
131
140
  Turn on -O3 flag, by default False
132
141
  clean : bool, optional
@@ -144,14 +153,20 @@ class VerilogModel:
144
153
  env['VM_PREFIX'] = f'{self._prj_name}_wrapper'
145
154
  env['STAMP'] = self._uuid
146
155
  env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
156
+ if nproc is not None:
157
+ env['N_JOBS'] = str(nproc)
147
158
  if o3:
148
159
  args.append('fast')
149
160
 
150
- if clean:
161
+ if clean is not False:
151
162
  m = re.compile(r'^lib.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.so$')
152
163
  for p in self._path.iterdir():
153
164
  if not p.is_dir() and m.match(p.name):
154
165
  p.unlink()
166
+ if clean:
167
+ subprocess.run(
168
+ ['make', '-f', 'build_binder.mk', 'clean'], env=env, cwd=self._path, check=True, capture_output=not verbose
169
+ )
155
170
 
156
171
  try:
157
172
  r = subprocess.run(args, env=env, check=True, cwd=self._path, capture_output=not verbose)
@@ -168,13 +183,19 @@ class VerilogModel:
168
183
 
169
184
  def _load_lib(self, uuid: str | None = None):
170
185
  uuid = uuid if uuid is not None else self._uuid
186
+ if uuid is None:
187
+ # load .so if there is only one, otherwise raise an error
188
+ libs = list(self._path.glob(f'lib{self._prj_name}_wrapper_*.so'))
189
+ if len(libs) == 0:
190
+ raise RuntimeError(f'Cannot load library, found {len(libs)} libraries in {self._path}')
191
+ uuid = libs[0].name.split('_')[-1].split('.', 1)[0]
171
192
  self._uuid = uuid
172
193
  lib_path = self._path / f'lib{self._prj_name}_wrapper_{uuid}.so'
173
194
  if not lib_path.exists():
174
195
  raise RuntimeError(f'Library {lib_path} does not exist')
175
196
  self._lib = ctypes.CDLL(str(lib_path))
176
197
 
177
- def compile(self, verbose=False, openmp=True, o3: bool = False):
198
+ def compile(self, verbose=False, openmp=True, nproc: int | None = None, o3: bool = False, clean=True):
178
199
  """Compile the generated code to a emulator for logic simulation.
179
200
 
180
201
  Parameters
@@ -183,8 +204,13 @@ class VerilogModel:
183
204
  Verbose output, by default False
184
205
  openmp : bool, optional
185
206
  Enable openmp, by default True
207
+ nproc : int | None, optional
208
+ Number of processes to use for compilation, by default None
209
+ If None, will use the number of CPU cores, but not more than 32.
186
210
  o3 : bool | None, optional
187
211
  Turn on -O3 flag, by default False
212
+ clean : bool, optional
213
+ Remove obsolete shared object files, by default True
188
214
 
189
215
  Raises
190
216
  ------
@@ -192,8 +218,7 @@ class VerilogModel:
192
218
  If compilation fails
193
219
  """
194
220
  self.write()
195
- self._compile(verbose=verbose, openmp=openmp, o3=o3)
196
- self._load_lib()
221
+ self._compile(verbose=verbose, openmp=openmp, nproc=nproc, o3=o3, clean=clean)
197
222
 
198
223
  def predict(self, data: NDArray[np.floating]):
199
224
  """Run the model on the input data.
@@ -225,7 +250,7 @@ class VerilogModel:
225
250
  out_data = np.empty(n_sample * out_size, dtype=np.int32)
226
251
 
227
252
  # Convert to int32 matching the LSB position
228
- inp_data[:] = data.ravel() * 2.0 ** np.max(f_in)
253
+ inp_data[:] = np.floor(data.ravel() * 2.0**f_in)
229
254
 
230
255
  inp_buf = inp_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
231
256
  out_buf = out_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
@@ -233,7 +258,7 @@ class VerilogModel:
233
258
 
234
259
  # Unscale the output int32 to recover fp values
235
260
  k, i, f = np.max(k_out), np.max(i_out), np.max(f_out)
236
- a, b, c = 2.0 ** (k + i + f), 2.0 ** (i + f), 2.0**-f
261
+ a, b, c = 2.0 ** (k + i + f), k * 2.0 ** (i + f), 2.0**-f
237
262
  return ((out_data.reshape(n_sample, out_size) + b) % a - b) * c
238
263
 
239
264
  def __repr__(self):
@@ -243,11 +268,12 @@ class VerilogModel:
243
268
  in_bits, out_bits = np.sum(kifs_in), np.sum(kifs_out)
244
269
  if self._pipe is not None:
245
270
  n_stage = len(self._pipe[0])
271
+ delay_suffix = '' if self._register_layers == 1 else f'x {self._register_layers} '
246
272
  lat_cutoff = self._latency_cutoff
247
273
  reg_bits = self._pipe.reg_bits
248
274
  spec = f"""Top Module: {self._prj_name}\n====================
249
275
  {inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
250
- {n_stage} stages @ max_delay={lat_cutoff}
276
+ {n_stage} {delay_suffix}stages @ max_delay={lat_cutoff}
251
277
  Estimated cost: {cost} LUTs, {reg_bits} FFs"""
252
278
 
253
279
  else:
@@ -258,7 +284,8 @@ Estimated cost: {cost} LUTs"""
258
284
 
259
285
  is_compiled = self._lib is not None
260
286
  if is_compiled:
261
- openmp = 'with OpenMP' if self._lib.openmp_enabled else '' # type: ignore
287
+ assert self._uuid is not None
288
+ openmp = 'with OpenMP' if self._lib.openmp_enabled() else '' # type: ignore
262
289
  spec += f'\nEmulator is compiled {openmp} ({self._uuid[-12:]})'
263
290
  else:
264
291
  spec += '\nEmulator is **not compiled**'
File without changes