da4ml 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of da4ml might be problematic.

Files changed (55)
  1. da4ml/_version.py +2 -2
  2. da4ml/cmvm/types.py +95 -15
  3. da4ml/codegen/__init__.py +5 -4
  4. da4ml/codegen/cpp/__init__.py +2 -1
  5. da4ml/codegen/cpp/cpp_codegen.py +56 -23
  6. da4ml/codegen/cpp/hls_model.py +252 -0
  7. da4ml/codegen/cpp/source/ap_types/ap_binary.h +78 -0
  8. da4ml/codegen/cpp/source/ap_types/ap_common.h +376 -0
  9. da4ml/codegen/cpp/source/ap_types/ap_decl.h +212 -0
  10. da4ml/codegen/cpp/source/ap_types/ap_fixed.h +360 -0
  11. da4ml/codegen/cpp/source/ap_types/ap_fixed_base.h +2354 -0
  12. da4ml/codegen/cpp/source/ap_types/ap_fixed_ref.h +718 -0
  13. da4ml/codegen/cpp/source/ap_types/ap_fixed_special.h +230 -0
  14. da4ml/codegen/cpp/source/ap_types/ap_int.h +330 -0
  15. da4ml/codegen/cpp/source/ap_types/ap_int_base.h +1885 -0
  16. da4ml/codegen/cpp/source/ap_types/ap_int_ref.h +1346 -0
  17. da4ml/codegen/cpp/source/ap_types/ap_int_special.h +223 -0
  18. da4ml/codegen/cpp/source/ap_types/ap_shift_reg.h +138 -0
  19. da4ml/codegen/cpp/source/ap_types/etc/ap_private.h +7199 -0
  20. da4ml/codegen/cpp/source/ap_types/hls_math.h +27 -0
  21. da4ml/codegen/cpp/source/ap_types/hls_stream.h +263 -0
  22. da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h +80 -0
  23. da4ml/codegen/cpp/source/binder_util.hh +56 -0
  24. da4ml/codegen/cpp/source/build_binder.mk +24 -0
  25. da4ml/codegen/cpp/source/{vitis.h → vitis_bitshift.hh} +1 -1
  26. da4ml/codegen/verilog/__init__.py +2 -3
  27. da4ml/codegen/verilog/comb.py +65 -24
  28. da4ml/codegen/verilog/io_wrapper.py +36 -141
  29. da4ml/codegen/verilog/source/binder_util.hh +72 -0
  30. da4ml/codegen/verilog/source/mux.v +58 -0
  31. da4ml/codegen/verilog/source/negative.v +28 -0
  32. da4ml/codegen/verilog/source/shift_adder.v +4 -1
  33. da4ml/codegen/verilog/source/template.xdc +3 -0
  34. da4ml/codegen/verilog/verilog_model.py +36 -12
  35. da4ml/converter/__init__.py +0 -0
  36. da4ml/converter/hgq2/parser.py +105 -0
  37. da4ml/converter/hgq2/replica.py +383 -0
  38. da4ml/trace/__init__.py +2 -2
  39. da4ml/trace/fixed_variable.py +175 -16
  40. da4ml/trace/fixed_variable_array.py +109 -4
  41. da4ml/trace/ops/__init__.py +22 -6
  42. da4ml/trace/ops/conv_utils.py +146 -14
  43. da4ml/trace/ops/einsum_utils.py +9 -6
  44. da4ml/trace/ops/reduce_utils.py +103 -0
  45. da4ml/trace/pipeline.py +36 -34
  46. da4ml/trace/tracer.py +37 -7
  47. da4ml-0.3.0.dist-info/METADATA +107 -0
  48. da4ml-0.3.0.dist-info/RECORD +64 -0
  49. da4ml/codegen/cpp/source/vitis_bridge.h +0 -17
  50. da4ml-0.2.1.dist-info/METADATA +0 -65
  51. da4ml-0.2.1.dist-info/RECORD +0 -39
  52. da4ml/codegen/verilog/source/{ioutils.hh → ioutil.hh} +0 -0
  53. {da4ml-0.2.1.dist-info → da4ml-0.3.0.dist-info}/WHEEL +0 -0
  54. {da4ml-0.2.1.dist-info → da4ml-0.3.0.dist-info}/licenses/LICENSE +0 -0
  55. {da4ml-0.2.1.dist-info → da4ml-0.3.0.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,7 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
  N = len(qints)
  ks, _is, fs = zip(*map(_minimal_kif, qints))
  Is = [_i + _k for _i, _k in zip(_is, ks)]
- max_I, max_f = max(Is), max(fs)
+ max_I, max_f = max(_is) + max(ks), max(fs)
  max_bw = max_I + max_f
  width_regular, width_packed = max_bw * N, sum(Is) + sum(fs)

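A note on the fix above: `max(Is)` takes the per-input maximum of `k + i`, while the new expression budgets the sign and integer fields separately, which can only be equal or larger. A minimal, self-contained sketch (reading `_minimal_kif` as returning per-input sign/integer/fraction bit counts is an assumption; the diff does not spell it out):

    # Illustrative numbers only: one unsigned input with 3 integer bits and
    # one signed input with 2 integer bits.
    ks, _is = (0, 1), (3, 2)
    Is = [i + k for i, k in zip(_is, ks)]
    print(max(Is))             # 3 -- old bound, per-input maximum of k + i
    print(max(_is) + max(ks))  # 4 -- new bound, sign and integer widths budgeted separately

A shared format wide enough for every input must carry the widest integer field and a sign bit at the same time, which appears to be what the new bound accounts for.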
@@ -32,11 +32,16 @@ def hetero_io_map(qints: list[QInterval], merge: bool = False):
  copy_from = hetero[i][0] if ks[i] else -1
  pads.append((base + max_bw - 1, base + max_bw - bias_high, copy_from))

+ mask = list(high < low for high, low in hetero)
+ regular = [r for r, m in zip(regular, mask) if not m]
+ hetero = [h for h, m in zip(hetero, mask) if not m]
+
  if not merge:
  return regular, hetero, pads, (width_regular, width_packed)

  # Merging consecutive intervals when possible
- for i in range(N - 2, -1, -1):
+ NN = len(regular) - 2
+ for i in range(NN, -1, -1):
  this_high = regular[i][0]
  next_low = regular[i + 1][1]
  if next_low - this_high != 1:
@@ -65,6 +70,8 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe
  _out_assignment: list[tuple[int, str]] = []

  for i, ((ih, jh), (ir, jr)) in enumerate(zip(het_out, reg_out)):
+ if ih == jh - 1:
+ continue
  _out_assignment.append((ih, f'assign out[{ir}:{jr}] = packed_out[{ih}:{jh}];'))

  for i, (i, j, copy_from) in enumerate(pad_out):
@@ -86,12 +93,12 @@ def generate_io_wrapper(sol: Solution | CascadedSolution, module_name: str, pipe

  module {module_name}_wrapper ({clk_and_rst_inp}
  // verilator lint_off UNUSEDSIGNAL
- input [{w_reg_in-1}:0] inp,
+ input [{w_reg_in - 1}:0] inp,
  // verilator lint_on UNUSEDSIGNAL
- output [{w_reg_out-1}:0] out
+ output [{w_reg_out - 1}:0] out
  );
- wire [{w_het_in-1}:0] packed_inp;
- wire [{w_het_out-1}:0] packed_out;
+ wire [{w_het_in - 1}:0] packed_inp;
+ wire [{w_het_out - 1}:0] packed_out;

  {inp_assignment_str}

@@ -106,150 +113,38 @@ endmodule
  """


- def comb_binder_gen(sol: Solution, module_name: str):
- k_in, i_in, f_in = zip(*map(_minimal_kif, sol.inp_qint))
- k_out, i_out, f_out = zip(*map(_minimal_kif, sol.out_qint))
- max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
- max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
-
- n_in, n_out = sol.shape
- return f"""#include "V{module_name}.h"
- #include "ioutils.hh"
- #include <verilated.h>
-
- #ifdef _OPENMP
- #include <omp.h>
- constexpr bool _openmp = true;
- #else
- constexpr bool _openmp = false;
- #endif
-
- constexpr size_t N_inp = {n_in};
- constexpr size_t N_out = {n_out};
- constexpr size_t max_inp_bw = {max_inp_bw};
- constexpr size_t max_out_bw = {max_out_bw};
- typedef V{module_name} dut_t;
-
- extern "C" {{
-
- bool openmp_enabled() {{
- return _openmp;
- }}
-
- void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
- dut_t *dut = new dut_t;
-
- for (size_t i = 0; i < n_samples; ++i) {{
- write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[i * N_inp]);
- dut->eval();
- read_output<N_out, max_out_bw>(dut->out, &c_out[i * N_out]);
- }}
-
- dut->final();
- delete dut;
- }}
-
- void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
- size_t n_max_threads = omp_get_max_threads();
- size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
- size_t n_thread = n_samples / n_samples_per_thread;
- n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
-
- #ifdef _OPENMP
- #pragma omp parallel for num_threads(n_thread) schedule(static)
- for (size_t i = 0; i < n_thread; ++i) {{
- size_t start = i * n_samples_per_thread;
- size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
- size_t n_samples_this_thread = end - start;
-
- _inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
- }}
- #else
- _inference(c_inp, c_out, n_samples);
- #endif
- }}
- }}"""
-
-
- def pipeline_binder_gen(csol: CascadedSolution, module_name: str, II: int = 1, latency_multiplier: int = 1):
+ def binder_gen(csol: CascadedSolution | Solution, module_name: str, II: int = 1, latency_multiplier: int = 1):
  k_in, i_in, f_in = zip(*map(_minimal_kif, csol.inp_qint))
  k_out, i_out, f_out = zip(*map(_minimal_kif, csol.out_qint))
- max_inp_bw = max(k + i for k, i in zip(k_in, i_in)) + max(f_in)
- max_out_bw = max(k + i for k, i in zip(k_out, i_out)) + max(f_out)
-
- latency = len(csol.solutions) * latency_multiplier
+ max_inp_bw = max(k_in) + max(i_in) + max(f_in)
+ max_out_bw = max(k_out) + max(i_out) + max(f_out)
+ if isinstance(csol, Solution):
+ II = latency = 0
+ else:
+ latency = len(csol.solutions) * latency_multiplier

  n_in, n_out = csol.shape
- return f"""#include "V{module_name}.h"
- #include "ioutils.hh"
- #include <verilated.h>
-
- #ifdef _OPENMP
- #include <omp.h>
- constexpr bool _openmp = true;
- #else
- constexpr bool _openmp = false;
- #endif
-
- constexpr size_t N_inp = {n_in};
- constexpr size_t N_out = {n_out};
- constexpr size_t max_inp_bw = {max_inp_bw};
- constexpr size_t max_out_bw = {max_out_bw};
- constexpr size_t II = {II};
- constexpr size_t latency = {latency};
- typedef V{module_name} dut_t;
+ return f"""#include <cstddef>
+ #include "binder_util.hh"
+ #include "V{module_name}.h"
+
+ struct {module_name}_config {{
+ static const size_t N_inp = {n_in};
+ static const size_t N_out = {n_out};
+ static const size_t max_inp_bw = {max_inp_bw};
+ static const size_t max_out_bw = {max_out_bw};
+ static const size_t II = {II};
+ static const size_t latency = {latency};
+ typedef V{module_name} dut_t;
+ }};

  extern "C" {{
-
  bool openmp_enabled() {{
  return _openmp;
  }}

- void _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
- dut_t *dut = new dut_t;
-
- size_t clk_req = n_samples * II + latency + 1;
-
- for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {{
- size_t t_out = t_inp - latency - 1;
-
- if (t_inp < n_samples * II && t_inp % II == 0) {{
- write_input<N_inp, max_inp_bw>(dut->inp, &c_inp[t_inp / II * N_inp]);
- }}
-
- dut->clk = 0;
- dut->eval();
-
- if (t_inp > latency && t_out % II == 0) {{
- read_output<N_out, max_out_bw>(dut->out, &c_out[t_out / II * N_out]);
- }}
-
- dut->clk = 1;
- dut->eval();
- }}
-
- dut->final();
- delete dut;
- }}
-
  void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {{
- #ifdef _OPENMP
- size_t n_max_threads = omp_get_max_threads();
- size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
- size_t n_thread = n_samples / n_samples_per_thread;
- n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
-
- #pragma omp parallel for num_threads(n_thread) schedule(static)
- for (size_t i = 0; i < n_thread; ++i) {{
- size_t start = i * n_samples_per_thread;
- size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
- size_t n_samples_this_thread = end - start;
-
- _inference(&c_inp[start * N_inp], &c_out[start * N_out], n_samples_this_thread);
- }}
- #else
- _inference(c_inp, c_out, n_samples);
- #endif
+ batch_inference<{module_name}_config>(c_inp, c_out, n_samples);
  }}
-
- }}"""
+ }}
+ """
@@ -0,0 +1,72 @@
+ #include "ioutil.hh"
+ #include <verilated.h>
+
+ #ifdef _OPENMP
+ #include <omp.h>
+ constexpr bool _openmp = true;
+ #else
+ constexpr bool _openmp = false;
+ #endif
+
+ template <typename CONFIG_T>
+ std::enable_if_t<CONFIG_T::II != 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
+ typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
+
+ size_t clk_req = n_samples * CONFIG_T::II + CONFIG_T::latency + 1;
+
+ for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {
+ size_t t_out = t_inp - CONFIG_T::latency - 1;
+
+ if (t_inp < n_samples * CONFIG_T::II && t_inp % CONFIG_T::II == 0) {
+ write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[t_inp / CONFIG_T::II * CONFIG_T::N_inp]);
+ }
+
+ dut->clk = 0;
+ dut->eval();
+
+ if (t_inp > CONFIG_T::latency && t_out % CONFIG_T::II == 0) {
+ read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[t_out / CONFIG_T::II * CONFIG_T::N_out]);
+ }
+
+ dut->clk = 1;
+ dut->eval();
+ }
+
+ dut->final();
+ delete dut;
+ }
+
+ template <typename CONFIG_T>
+ std::enable_if_t<CONFIG_T::II == 0> _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
+ typename CONFIG_T::dut_t *dut = new typename CONFIG_T::dut_t;
+
+ for (size_t i = 0; i < n_samples; ++i) {
+ write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(dut->inp, &c_inp[i * CONFIG_T::N_inp]);
+ dut->eval();
+ read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(dut->out, &c_out[i * CONFIG_T::N_out]);
+ }
+
+ dut->final();
+ delete dut;
+ }
+
+ template <typename CONFIG_T> void batch_inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
+ #ifdef _OPENMP
+ size_t n_max_threads = omp_get_max_threads();
+ size_t n_samples_per_thread = std::max<size_t>(n_samples / n_max_threads, 32);
+ size_t n_thread = n_samples / n_samples_per_thread;
+ n_thread += (n_samples % n_samples_per_thread) ? 1 : 0;
+
+ #pragma omp parallel for num_threads(n_thread) schedule(static)
+ for (size_t i = 0; i < n_thread; ++i) {
+ size_t start = i * n_samples_per_thread;
+ size_t end = std::min<size_t>(start + n_samples_per_thread, n_samples);
+ size_t n_samples_this_thread = end - start;
+ size_t offset_in = start * CONFIG_T::N_inp;
+ size_t offset_out = start * CONFIG_T::N_out;
+ _inference<CONFIG_T>(&c_inp[offset_in], &c_out[offset_out], n_samples_this_thread);
+ }
+ #else
+ _inference<CONFIG_T>(c_inp, c_out, n_samples);
+ #endif
+ }
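The generated binder above only fills in a per-module `*_config` struct; the clocked and combinational drive loops live in this shared header and are selected at compile time by `II`. On the Python side the resulting shared object is opened with `ctypes`, as `_load_lib` and `predict` below show. A rough sketch of that calling convention (library name, shapes, and data are placeholders, not values from this diff):

    import ctypes
    import numpy as np

    # Placeholder path: the real file follows the lib<project>_wrapper_<uuid>.so
    # pattern used by VerilogModel._load_lib further down in this diff.
    lib = ctypes.CDLL('./libmodel_wrapper_XXXX.so')
    print('OpenMP enabled:', bool(lib.openmp_enabled()))

    n_samples, n_in, n_out = 4, 16, 8                 # hypothetical shapes
    inp = np.zeros(n_samples * n_in, dtype=np.int32)  # LSB-aligned fixed-point integers
    out = np.empty(n_samples * n_out, dtype=np.int32)
    lib.inference(
        inp.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        out.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
        ctypes.c_size_t(n_samples),
    )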
@@ -0,0 +1,58 @@
+ `timescale 1ns / 1ps
+
+
+ module mux #(
+ parameter BW_INPUT0 = 32,
+ parameter BW_INPUT1 = 32,
+ parameter SIGNED0 = 0,
+ parameter SIGNED1 = 0,
+ parameter BW_OUT = 32,
+ parameter SHIFT1 = 0,
+ parameter INVERT1 = 0
+ ) (
+ input key,
+ input [BW_INPUT0-1:0] in0,
+ input [BW_INPUT1-1:0] in1,
+ output [BW_OUT-1:0] out
+ );
+
+ localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
+ localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
+ localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? INVERT1 + 1 : INVERT1 + 0;
+ localparam BW_BUF = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD : IN1_NEED_BITS + EXTRA_PAD;
+ localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_BUF - BW_INPUT0 + SHIFT1 : BW_BUF - BW_INPUT0;
+ localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
+ localparam IN1_PAD_LEFT = (SHIFT1 > 0) ? BW_BUF - BW_INPUT1 - SHIFT1 : BW_BUF - BW_INPUT1;
+ localparam IN1_PAD_RIGHT = (SHIFT1 > 0) ? SHIFT1 : 0;
+
+
+ // verilator lint_off UNUSEDSIGNAL
+ wire [BW_BUF-1:0] in0_ext;
+ wire [BW_BUF-1:0] in1_ext;
+ // verilator lint_on UNUSEDSIGNAL
+
+ generate
+ if (SIGNED0 == 1) begin : in0_is_signed
+ assign in0_ext = {{IN0_PAD_LEFT{in0[BW_INPUT0-1]}}, in0, {IN0_PAD_RIGHT{1'b0}}};
+ end else begin : in0_is_unsigned
+ assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
+ end
+ endgenerate
+
+ generate
+ if (SIGNED1 == 1) begin : in1_is_signed
+ assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
+ end else begin : in1_is_unsigned
+ assign in1_ext = {{IN1_PAD_LEFT{1'b0}}, in1, {IN1_PAD_RIGHT{1'b0}}};
+ end
+ endgenerate
+
+ generate
+ if (INVERT1 == 1) begin : is_invert
+ assign out = (key) ? in0_ext[BW_OUT-1:0] : -in1_ext[BW_OUT-1:0];
+ end else begin : is_not_invert
+ assign out = (key) ? in0_ext[BW_OUT-1:0] : in1_ext[BW_OUT-1:0];
+ end
+ endgenerate
+
+ endmodule
@@ -0,0 +1,28 @@
+ `timescale 1ns / 1ps
+
+
+ module negative #(
+ parameter BW_IN = 32,
+ parameter BW_OUT = 32,
+ parameter IN_SIGNED = 0
+ ) (
+ // verilator lint_off UNUSEDSIGNAL
+ input [ BW_IN-1:0] in,
+ // verilator lint_off UNUSEDSIGNAL
+ output [BW_OUT-1:0] out
+ );
+ generate
+ if (BW_IN < BW_OUT) begin : in_is_smaller
+ wire [BW_OUT-1:0] in_ext;
+ if (IN_SIGNED == 1) begin : is_signed
+ assign in_ext = {{BW_OUT - BW_IN{in[BW_IN-1]}}, in};
+ end else begin : is_unsigned
+ assign in_ext = {{BW_OUT - BW_IN{1'b0}}, in};
+ end
+ assign out = -in_ext;
+ end else begin : in_is_bigger
+ assign out = -in[BW_OUT-1:0];
+ end
+ endgenerate
+
+ endmodule
@@ -17,7 +17,7 @@ module shift_adder #(

  localparam IN0_NEED_BITS = (SHIFT1 < 0) ? BW_INPUT0 - SHIFT1 : BW_INPUT0;
  localparam IN1_NEED_BITS = (SHIFT1 > 0) ? BW_INPUT1 + SHIFT1 : BW_INPUT1;
- localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB+1 : IS_SUB+0;
+ localparam EXTRA_PAD = (SIGNED0 != SIGNED1) ? IS_SUB + 1 : IS_SUB + 0;
  localparam BW_ADD = (IN0_NEED_BITS > IN1_NEED_BITS) ? IN0_NEED_BITS + EXTRA_PAD + 1 : IN1_NEED_BITS + EXTRA_PAD + 1;
  localparam IN0_PAD_LEFT = (SHIFT1 < 0) ? BW_ADD - BW_INPUT0 + SHIFT1 : BW_ADD - BW_INPUT0;
  localparam IN0_PAD_RIGHT = (SHIFT1 < 0) ? -SHIFT1 : 0;
@@ -37,6 +37,9 @@ module shift_adder #(
  end else begin : in0_is_unsigned
  assign in0_ext = {{IN0_PAD_LEFT{1'b0}}, in0, {IN0_PAD_RIGHT{1'b0}}};
  end
+ endgenerate
+
+ generate
  if (SIGNED1 == 1) begin : in1_is_signed
  assign in1_ext = {{IN1_PAD_LEFT{in1[BW_INPUT1-1]}}, in1, {IN1_PAD_RIGHT{1'b0}}};
  end else begin : in1_is_unsigned
@@ -27,3 +27,6 @@ set_clock_uncertainty -setup $uncertainty_setup [get_clocks sys_clk]
  set_clock_uncertainty -hold $uncertainty_hold [get_clocks sys_clk]

  set_property HD.CLK_SRC BUFG_X0Y0 [get_ports clk]
+
+ set_property retiming_forward 1 [get_cells {stage[*]_inp}]
+ set_property retiming_backward 1 [get_cells {stage[*]_inp}]
@@ -13,7 +13,7 @@ from numpy.typing import NDArray
  from ... import codegen
  from ...cmvm.types import CascadedSolution, Solution, _minimal_kif
  from ...trace.pipeline import to_pipeline
- from . import comb_binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_binder_gen, pipeline_logic_gen
+ from . import binder_gen, comb_logic_gen, generate_io_wrapper, pipeline_logic_gen


  def get_io_kifs(sol: Solution | CascadedSolution):
@@ -59,6 +59,7 @@ class VerilogModel:
  self._latency_cutoff = latency_cutoff

  self._lib = None
+ self._uuid = None

  def write(self):
  self._path.mkdir(parents=True, exist_ok=True)
@@ -89,7 +90,7 @@ class VerilogModel:
  f.write(xdc)

  # C++ binder w/ verilog wrapper for uniform bw
- binder = pipeline_binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)
+ binder = binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)

  # Verilog IO wrapper (non-uniform bw to uniform one, clk passthrough)
  io_wrapper = generate_io_wrapper(self._pipe, self._prj_name, True)
@@ -105,7 +106,7 @@ class VerilogModel:

  # Verilog IO wrapper (non-uniform bw to uniform one, no clk)
  io_wrapper = generate_io_wrapper(self._solution, self._prj_name, False)
- binder = comb_binder_gen(self._solution, f'{self._prj_name}_wrapper')
+ binder = binder_gen(self._solution, f'{self._prj_name}_wrapper')

  with open(self._path / f'{self._prj_name}_wrapper.v', 'w') as f:
  f.write(io_wrapper)
@@ -114,13 +115,16 @@ class VerilogModel:

  # Common resource copy
  shutil.copy(self.__src_root / 'verilog/source/shift_adder.v', self._path)
+ shutil.copy(self.__src_root / 'verilog/source/mux.v', self._path)
+ shutil.copy(self.__src_root / 'verilog/source/negative.v', self._path)
  shutil.copy(self.__src_root / 'verilog/source/build_binder.mk', self._path)
- shutil.copy(self.__src_root / 'verilog/source/ioutils.hh', self._path)
+ shutil.copy(self.__src_root / 'verilog/source/ioutil.hh', self._path)
+ shutil.copy(self.__src_root / 'verilog/source/binder_util.hh', self._path)
  self._solution.save(self._path / 'model.json')
  with open(self._path / 'misc.json', 'w') as f:
  f.write(f'{{"cost": {self._solution.cost}}}')

- def _compile(self, verbose=False, openmp=True, o3: bool = False, clean=True):
+ def _compile(self, verbose=False, openmp=True, nproc=None, o3: bool = False, clean=True):
  """Same as compile, but will not write to the library

  Parameters
@@ -129,6 +133,9 @@ class VerilogModel:
  Verbose output, by default False
  openmp : bool, optional
  Enable openmp, by default True
+ nproc : int | None, optional
+ Number of processes to use for compilation, by default None
+ If None, will use the number of CPU cores, but not more than 32.
  o3 : bool | None, optional
  Turn on -O3 flag, by default False
  clean : bool, optional
@@ -146,14 +153,20 @@ class VerilogModel:
  env['VM_PREFIX'] = f'{self._prj_name}_wrapper'
  env['STAMP'] = self._uuid
  env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
+ if nproc is not None:
+ env['N_JOBS'] = str(nproc)
  if o3:
  args.append('fast')

- if clean:
+ if clean is not False:
  m = re.compile(r'^lib.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.so$')
  for p in self._path.iterdir():
  if not p.is_dir() and m.match(p.name):
  p.unlink()
+ if clean:
+ subprocess.run(
+ ['make', '-f', 'build_binder.mk', 'clean'], env=env, cwd=self._path, check=True, capture_output=not verbose
+ )

  try:
  r = subprocess.run(args, env=env, check=True, cwd=self._path, capture_output=not verbose)
@@ -170,13 +183,19 @@ class VerilogModel:

  def _load_lib(self, uuid: str | None = None):
  uuid = uuid if uuid is not None else self._uuid
+ if uuid is None:
+ # load .so if there is only one, otherwise raise an error
+ libs = list(self._path.glob(f'lib{self._prj_name}_wrapper_*.so'))
+ if len(libs) == 0:
+ raise RuntimeError(f'Cannot load library, found {len(libs)} libraries in {self._path}')
+ uuid = libs[0].name.split('_')[-1].split('.', 1)[0]
  self._uuid = uuid
  lib_path = self._path / f'lib{self._prj_name}_wrapper_{uuid}.so'
  if not lib_path.exists():
  raise RuntimeError(f'Library {lib_path} does not exist')
  self._lib = ctypes.CDLL(str(lib_path))

- def compile(self, verbose=False, openmp=True, o3: bool = False):
+ def compile(self, verbose=False, openmp=True, nproc: int | None = None, o3: bool = False, clean=True):
  """Compile the generated code to a emulator for logic simulation.

  Parameters
@@ -185,8 +204,13 @@ class VerilogModel:
  Verbose output, by default False
  openmp : bool, optional
  Enable openmp, by default True
+ nproc : int | None, optional
+ Number of processes to use for compilation, by default None
+ If None, will use the number of CPU cores, but not more than 32.
  o3 : bool | None, optional
  Turn on -O3 flag, by default False
+ clean : bool, optional
+ Remove obsolete shared object files, by default True

  Raises
  ------
@@ -194,8 +218,7 @@ class VerilogModel:
  If compilation fails
  """
  self.write()
- self._compile(verbose=verbose, openmp=openmp, o3=o3)
- self._load_lib()
+ self._compile(verbose=verbose, openmp=openmp, nproc=nproc, o3=o3, clean=clean)

  def predict(self, data: NDArray[np.floating]):
  """Run the model on the input data.
@@ -227,7 +250,7 @@ class VerilogModel:
  out_data = np.empty(n_sample * out_size, dtype=np.int32)

  # Convert to int32 matching the LSB position
- inp_data[:] = data.ravel() * 2.0 ** np.max(f_in)
+ inp_data[:] = np.floor(data.ravel() * 2.0**f_in)

  inp_buf = inp_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
  out_buf = out_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
@@ -235,7 +258,7 @@ class VerilogModel:

  # Unscale the output int32 to recover fp values
  k, i, f = np.max(k_out), np.max(i_out), np.max(f_out)
- a, b, c = 2.0 ** (k + i + f), 2.0 ** (i + f), 2.0**-f
+ a, b, c = 2.0 ** (k + i + f), k * 2.0 ** (i + f), 2.0**-f
  return ((out_data.reshape(n_sample, out_size) + b) % a - b) * c

  def __repr__(self):
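The corrected unscaling treats each raw emulator output as a two's-complement code: `a` is the number of codes, `b` is half the range when the outputs are signed (`k = 1`) and zero otherwise, and `c` is the LSB weight. A self-contained numeric check (the precision values below are chosen for illustration, not taken from this diff):

    import numpy as np

    k, i, f = 1, 3, 2                                    # signed, 3 integer bits, 2 fraction bits
    a, b, c = 2.0 ** (k + i + f), k * 2.0 ** (i + f), 2.0 ** -f
    raw = np.array([0, 1, 31, 32, 63], dtype=np.int32)   # 6-bit codes from the emulator
    print(((raw + b) % a - b) * c)                       # [ 0.    0.25  7.75 -8.   -0.25]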
@@ -261,7 +284,8 @@ Estimated cost: {cost} LUTs"""

  is_compiled = self._lib is not None
  if is_compiled:
- openmp = 'with OpenMP' if self._lib.openmp_enabled else '' # type: ignore
+ assert self._uuid is not None
+ openmp = 'with OpenMP' if self._lib.openmp_enabled() else '' # type: ignore
  spec += f'\nEmulator is compiled {openmp} ({self._uuid[-12:]})'
  else:
  spec += '\nEmulator is **not compiled**'
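Taken together, the updated compile() and predict() shown above give the emulation workflow roughly this shape (a hedged sketch; the VerilogModel constructor is not part of this diff, so it is elided):

    vm = ...  # a VerilogModel instance, constructed elsewhere (not shown in this diff)
    vm.compile(verbose=False, openmp=True, nproc=8, o3=True, clean=True)
    y = vm.predict(x)  # x: floating-point ndarray; returns de-quantized float outputs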
@@ -0,0 +1,105 @@
+ from collections.abc import Sequence
+ from dataclasses import dataclass
+ from typing import Any
+
+ import keras
+ from keras import KerasTensor, Operation
+
+ from ...trace import FixedVariableArray, HWConfig
+ from ...trace.fixed_variable_array import FixedVariableArrayInput
+ from .replica import _registry
+
+
+ @dataclass
+ class OpObj:
+ operation: Operation
+ args: list
+ kwargs: dict
+ produces: tuple[KerasTensor, ...]
+ requires: tuple[KerasTensor, ...]
+
+
+ def parse_model(model: keras.Model):
+ operators: dict[int, list[OpObj]] = {}
+ for depth, nodes in model._nodes_by_depth.items():
+ _oprs = []
+ for node in nodes:
+ assert isinstance(node.operation, keras.Operation)
+ opr = OpObj(
+ operation=node.operation,
+ args=node.arguments.args,
+ kwargs=node.arguments.kwargs,
+ produces=node.outputs,
+ requires=node.arguments.keras_tensors,
+ )
+ _oprs.append(opr)
+ operators[depth] = _oprs
+ return [operators[i] for i in range(max(operators.keys()), -1, -1)]
+
+
+ def replace_tensors(tensor_map: dict[KerasTensor, FixedVariableArray], obj: Any) -> Any:
+ if isinstance(obj, KerasTensor):
+ return tensor_map[obj]
+ if isinstance(obj, list):
+ return [replace_tensors(tensor_map, o) for o in obj]
+ if isinstance(obj, tuple):
+ return tuple(replace_tensors(tensor_map, o) for o in obj)
+ if isinstance(obj, dict):
+ return {k: replace_tensors(tensor_map, v) for k, v in obj.items()}
+ return obj
+
+
+ def _apply_nn(
+ model: keras.Model, inputs: FixedVariableArray | Sequence[FixedVariableArray], verbose: bool = False
+ ) -> tuple[FixedVariableArray, ...]:
+ """
+ Apply a keras model to a fixed variable array or a sequence of fixed variable arrays.
+
+ Parameters
+ ----------
+ model : keras.Model
+ The keras model to apply.
+ inputs : FixedVariableArray or Sequence[FixedVariableArray]
+ The input fixed variable array or sequence of fixed variable arrays.
+
+ Returns
+ -------
+ tuple of FixedVariableArray
+ A tuple containing the output(s) of the model as FixedVariableArray.
+ """
+ if isinstance(inputs, FixedVariableArray):
+ inputs = (inputs,)
+
+ assert len(model.inputs) == len(inputs), f'Model has {len(model.inputs)} inputs, got {len(inputs)}'
+ tensor_map = {keras_tensor: da_tensor for keras_tensor, da_tensor in zip(model.inputs, inputs)}
+
+ for ops in parse_model(model):
+ for op in ops:
+ assert all(t in tensor_map for t in op.requires)
+ args = replace_tensors(tensor_map, op.args)
+ kwargs: dict[str, Any] = replace_tensors(tensor_map, op.kwargs)
+ if op.operation.__class__ is keras.layers.InputLayer:
+ continue
+ mirror_op = _registry[op.operation.__class__](op.operation)
+ if verbose:
+ print(f'Processing operation {op.operation.name} ({op.operation.__class__.__name__})')
+ outputs = mirror_op(*args, **kwargs)
+ for keras_tensor, da_tensor in zip(op.produces, outputs):
+ tensor_map[keras_tensor] = da_tensor
+
+ return tuple(tensor_map[keras_tensor] for keras_tensor in model.outputs)
+
+
+ def trace_model(
+ model: keras.Model,
+ hwconf: HWConfig = HWConfig(1, -1, -1),
+ solver_options: dict[str, Any] | None = None,
+ verbose: bool = False,
+ inputs: tuple[FixedVariableArray, ...] | None = None,
+ ) -> tuple[tuple[FixedVariableArray, ...], tuple[FixedVariableArray, ...]]:
+ if inputs is None:
+ inputs = tuple(
+ FixedVariableArrayInput(inp.shape[1:], hwconf=hwconf, solver_options=solver_options) for inp in model.inputs
+ )
+ outputs = _apply_nn(model, inputs, verbose=verbose)
+ return inputs, outputs
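A hedged usage sketch for the new converter entry point. Which layer classes the `_registry` in `replica.py` actually supports is not visible in this diff, so the plain Dense model below is illustrative only:

    import keras
    from da4ml.converter.hgq2.parser import trace_model
    from da4ml.trace import HWConfig

    # Toy functional model; real use presumably targets quantized/HGQ2 layers
    # registered in replica.py.
    inp = keras.Input((16,))
    out = keras.layers.Dense(4)(inp)
    model = keras.Model(inp, out)

    inputs, outputs = trace_model(model, hwconf=HWConfig(1, -1, -1), verbose=True)
    # inputs/outputs are tuples of FixedVariableArray tracing the model graph symbolically.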