da4ml 0.5.0__cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. da4ml/__init__.py +4 -0
  2. da4ml/_binary/__init__.py +15 -0
  3. da4ml/_binary/dais_bin.cpython-312-x86_64-linux-gnu.so +0 -0
  4. da4ml/_binary/dais_bin.pyi +5 -0
  5. da4ml/_cli/__init__.py +30 -0
  6. da4ml/_cli/convert.py +194 -0
  7. da4ml/_cli/report.py +295 -0
  8. da4ml/_version.py +32 -0
  9. da4ml/cmvm/__init__.py +4 -0
  10. da4ml/cmvm/api.py +264 -0
  11. da4ml/cmvm/core/__init__.py +221 -0
  12. da4ml/cmvm/core/indexers.py +83 -0
  13. da4ml/cmvm/core/state_opr.py +284 -0
  14. da4ml/cmvm/types.py +739 -0
  15. da4ml/cmvm/util/__init__.py +7 -0
  16. da4ml/cmvm/util/bit_decompose.py +86 -0
  17. da4ml/cmvm/util/mat_decompose.py +121 -0
  18. da4ml/codegen/__init__.py +9 -0
  19. da4ml/codegen/hls/__init__.py +4 -0
  20. da4ml/codegen/hls/hls_codegen.py +196 -0
  21. da4ml/codegen/hls/hls_model.py +255 -0
  22. da4ml/codegen/hls/source/ap_types/ap_binary.h +78 -0
  23. da4ml/codegen/hls/source/ap_types/ap_common.h +376 -0
  24. da4ml/codegen/hls/source/ap_types/ap_decl.h +212 -0
  25. da4ml/codegen/hls/source/ap_types/ap_fixed.h +360 -0
  26. da4ml/codegen/hls/source/ap_types/ap_fixed_base.h +2354 -0
  27. da4ml/codegen/hls/source/ap_types/ap_fixed_ref.h +718 -0
  28. da4ml/codegen/hls/source/ap_types/ap_fixed_special.h +230 -0
  29. da4ml/codegen/hls/source/ap_types/ap_int.h +330 -0
  30. da4ml/codegen/hls/source/ap_types/ap_int_base.h +1885 -0
  31. da4ml/codegen/hls/source/ap_types/ap_int_ref.h +1346 -0
  32. da4ml/codegen/hls/source/ap_types/ap_int_special.h +223 -0
  33. da4ml/codegen/hls/source/ap_types/ap_shift_reg.h +138 -0
  34. da4ml/codegen/hls/source/ap_types/etc/ap_private.h +7199 -0
  35. da4ml/codegen/hls/source/ap_types/hls_math.h +27 -0
  36. da4ml/codegen/hls/source/ap_types/hls_stream.h +263 -0
  37. da4ml/codegen/hls/source/ap_types/utils/x_hls_utils.h +80 -0
  38. da4ml/codegen/hls/source/binder_util.hh +71 -0
  39. da4ml/codegen/hls/source/build_binder.mk +22 -0
  40. da4ml/codegen/hls/source/vitis_bitshift.hh +32 -0
  41. da4ml/codegen/rtl/__init__.py +15 -0
  42. da4ml/codegen/rtl/common_source/binder_util.hh +99 -0
  43. da4ml/codegen/rtl/common_source/build_binder.mk +34 -0
  44. da4ml/codegen/rtl/common_source/build_quartus_prj.tcl +104 -0
  45. da4ml/codegen/rtl/common_source/build_vivado_prj.tcl +111 -0
  46. da4ml/codegen/rtl/common_source/ioutil.hh +124 -0
  47. da4ml/codegen/rtl/common_source/template.sdc +27 -0
  48. da4ml/codegen/rtl/common_source/template.xdc +30 -0
  49. da4ml/codegen/rtl/rtl_model.py +486 -0
  50. da4ml/codegen/rtl/verilog/__init__.py +10 -0
  51. da4ml/codegen/rtl/verilog/comb.py +239 -0
  52. da4ml/codegen/rtl/verilog/io_wrapper.py +113 -0
  53. da4ml/codegen/rtl/verilog/pipeline.py +67 -0
  54. da4ml/codegen/rtl/verilog/source/lookup_table.v +27 -0
  55. da4ml/codegen/rtl/verilog/source/multiplier.v +37 -0
  56. da4ml/codegen/rtl/verilog/source/mux.v +58 -0
  57. da4ml/codegen/rtl/verilog/source/negative.v +31 -0
  58. da4ml/codegen/rtl/verilog/source/shift_adder.v +59 -0
  59. da4ml/codegen/rtl/vhdl/__init__.py +9 -0
  60. da4ml/codegen/rtl/vhdl/comb.py +206 -0
  61. da4ml/codegen/rtl/vhdl/io_wrapper.py +120 -0
  62. da4ml/codegen/rtl/vhdl/pipeline.py +71 -0
  63. da4ml/codegen/rtl/vhdl/source/lookup_table.vhd +52 -0
  64. da4ml/codegen/rtl/vhdl/source/multiplier.vhd +40 -0
  65. da4ml/codegen/rtl/vhdl/source/mux.vhd +102 -0
  66. da4ml/codegen/rtl/vhdl/source/negative.vhd +35 -0
  67. da4ml/codegen/rtl/vhdl/source/shift_adder.vhd +101 -0
  68. da4ml/converter/__init__.py +63 -0
  69. da4ml/converter/hgq2/__init__.py +3 -0
  70. da4ml/converter/hgq2/layers/__init__.py +11 -0
  71. da4ml/converter/hgq2/layers/_base.py +132 -0
  72. da4ml/converter/hgq2/layers/activation.py +81 -0
  73. da4ml/converter/hgq2/layers/attn.py +148 -0
  74. da4ml/converter/hgq2/layers/batchnorm.py +15 -0
  75. da4ml/converter/hgq2/layers/conv.py +149 -0
  76. da4ml/converter/hgq2/layers/dense.py +39 -0
  77. da4ml/converter/hgq2/layers/ops.py +240 -0
  78. da4ml/converter/hgq2/layers/pool.py +107 -0
  79. da4ml/converter/hgq2/layers/table.py +176 -0
  80. da4ml/converter/hgq2/parser.py +161 -0
  81. da4ml/trace/__init__.py +6 -0
  82. da4ml/trace/fixed_variable.py +965 -0
  83. da4ml/trace/fixed_variable_array.py +600 -0
  84. da4ml/trace/ops/__init__.py +13 -0
  85. da4ml/trace/ops/einsum_utils.py +305 -0
  86. da4ml/trace/ops/quantization.py +74 -0
  87. da4ml/trace/ops/reduce_utils.py +105 -0
  88. da4ml/trace/pipeline.py +181 -0
  89. da4ml/trace/tracer.py +186 -0
  90. da4ml/typing/__init__.py +3 -0
  91. da4ml-0.5.0.dist-info/METADATA +85 -0
  92. da4ml-0.5.0.dist-info/RECORD +96 -0
  93. da4ml-0.5.0.dist-info/WHEEL +6 -0
  94. da4ml-0.5.0.dist-info/entry_points.txt +3 -0
  95. da4ml-0.5.0.dist-info/sboms/auditwheel.cdx.json +1 -0
  96. da4ml.libs/libgomp-e985bcbb.so.1.0.0 +0 -0
@@ -0,0 +1,27 @@
1
+ #ifndef X_HLS_MATH_H
2
+ #define X_HLS_MATH_H
3
+
4
+ #include <cmath>
5
+ #include "ap_fixed.h"
6
+
7
namespace hls {

// C-simulation stand-ins for the HLS math functions. Every wrapper converts
// its argument(s) to double through to_double(), evaluates the corresponding
// <cmath> routine and converts the double result back to T.

/// Returns std::exp of x.to_double(), converted back to T.
template<class T>
static T exp(const T x) {
    return static_cast<T>(std::exp(x.to_double()));
}

/// Returns std::sin of x.to_double(), converted back to T.
template <typename T> T sin(T x) { return static_cast<T>(std::sin(x.to_double())); }

/// Returns std::cos of x.to_double(), converted back to T.
template <typename T> T cos(T x) { return static_cast<T>(std::cos(x.to_double())); }

/// Returns std::asin of x.to_double(), converted back to T.
template <typename T> T asin(T x) { return static_cast<T>(std::asin(x.to_double())); }

/// Returns std::acos of x.to_double(), converted back to T.
template <typename T> T acos(T x) { return static_cast<T>(std::acos(x.to_double())); }

/// Returns std::atan of x.to_double(), converted back to T.
template <typename T> T atan(T x) { return static_cast<T>(std::atan(x.to_double())); }

/// Returns std::atan2(x.to_double(), y.to_double()), converted back to T.
/// The arguments are forwarded in the order given, so the first parameter
/// (named x here) plays the role of the first argument of std::atan2.
template <typename T> T atan2(T x, T y) {
    // This has to be std::atan2: calling hls::atan2 with two doubles would
    // instantiate this very template with T = double, and double has no
    // to_double() member.
    return static_cast<T>(std::atan2(x.to_double(), y.to_double()));
}

}
27
+ #endif
@@ -0,0 +1,263 @@
1
+ /*
2
+ #- (c) Copyright 2011-2018 Xilinx, Inc. All rights reserved.
3
+ #-
4
+ #- This file contains confidential and proprietary information
5
+ #- of Xilinx, Inc. and is protected under U.S. and
6
+ #- international copyright and other intellectual property
7
+ #- laws.
8
+ #-
9
+ #- DISCLAIMER
10
+ #- This disclaimer is not a license and does not grant any
11
+ #- rights to the materials distributed herewith. Except as
12
+ #- otherwise provided in a valid license issued to you by
13
+ #- Xilinx, and to the maximum extent permitted by applicable
14
+ #- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
15
+ #- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
16
+ #- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
17
+ #- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
18
+ #- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
19
+ #- (2) Xilinx shall not be liable (whether in contract or tort,
20
+ #- including negligence, or under any other theory of
21
+ #- liability) for any loss or damage of any kind or nature
22
+ #- related to, arising under or in connection with these
23
+ #- materials, including for any direct, or any indirect,
24
+ #- special, incidental, or consequential loss or damage
25
+ #- (including loss of data, profits, goodwill, or any type of
26
+ #- loss or damage suffered as a result of any action brought
27
+ #- by a third party) even if such damage or loss was
28
+ #- reasonably foreseeable or Xilinx had been advised of the
29
+ #- possibility of the same.
30
+ #-
31
+ #- CRITICAL APPLICATIONS
32
+ #- Xilinx products are not designed or intended to be fail-
33
+ #- safe, or for use in any application requiring fail-safe
34
+ #- performance, such as life-support or safety devices or
35
+ #- systems, Class III medical devices, nuclear facilities,
36
+ #- applications related to the deployment of airbags, or any
37
+ #- other applications that could lead to death, personal
38
+ #- injury, or severe property or environmental damage
39
+ #- (individually and collectively, "Critical
40
+ #- Applications"). Customer assumes the sole risk and
41
+ #- liability of any use of Xilinx products in Critical
42
+ #- Applications, subject only to applicable laws and
43
+ #- regulations governing limitations on product liability.
44
+ #-
45
+ #- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
46
+ #- PART OF THIS FILE AT ALL TIMES.
47
+ #- ************************************************************************
48
+
49
+
50
+ Licensed under the Apache License, Version 2.0 (the "License");
51
+ you may not use this file except in compliance with the License.
52
+ You may obtain a copy of the License at
53
+
54
+ http://www.apache.org/licenses/LICENSE-2.0
55
+
56
+ Unless required by applicable law or agreed to in writing, software
57
+ distributed under the License is distributed on an "AS IS" BASIS,
58
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
59
+ See the License for the specific language governing permissions and
60
+ limitations under the License.
61
+ */
62
+
63
+ #ifndef X_HLS_STREAM_SIM_H
64
+ #define X_HLS_STREAM_SIM_H
65
+
66
+ /*
67
+ * This file contains a C++ model of hls::stream.
68
+ * It defines the C simulation model.
69
+ */
70
+ #ifndef __cplusplus
71
+
72
+ #error C++ is required to include this header file
73
+
74
+ #else
75
+
76
+ //////////////////////////////////////////////
77
+ // C level simulation models for hls::stream
78
+ //////////////////////////////////////////////
79
+ #include <queue>
80
+ #include <iostream>
81
+ #include <typeinfo>
82
+ #include <string>
83
+ #include <sstream>
84
+
85
+ #ifdef HLS_STREAM_THREAD_SAFE
86
+ #include <mutex>
87
+ #include <condition_variable>
88
+ #endif
89
+
90
+ #ifndef _MSC_VER
91
+ #include <cxxabi.h>
92
+ #include <stdlib.h>
93
+ #endif
94
+
95
namespace hls {

/// C simulation model of hls::stream: a FIFO of __STREAM_T__ elements backed
/// by a std::deque. The model never reports itself as full. When
/// HLS_STREAM_THREAD_SAFE is defined, accesses to the queue are serialized
/// with a mutex and read() blocks until an element is available.
template<typename __STREAM_T__>
class stream
{
  protected:
    std::string _name;               // label printed in the warning messages
    std::deque<__STREAM_T__> _data;  // container for the elements
#ifdef HLS_STREAM_THREAD_SAFE
    std::mutex _mutex;                       // guards _data
    std::condition_variable _condition_var;  // notified by write(), awaited by read()
#endif

  public:
    /// Constructors
    // Keep consistent with the synthesis model's constructors

    /// Creates a stream named "<type name>.<n>", where <n> comes from a
    /// counter shared by all default-constructed streams of this element type.
    stream() {
        static unsigned _counter = 1;
        std::stringstream ss;
#ifndef _MSC_VER
        // __cxa_demangle returns a malloc'ed buffer (or null on failure).
        char* _demangle_name = abi::__cxa_demangle(typeid(*this).name(), 0, 0, 0);
        if (_demangle_name) {
            _name = _demangle_name;
            free(_demangle_name);
        }
        else {
            _name = "hls_stream";
        }
#else
        _name = typeid(*this).name();
#endif

        ss << _counter++;
        _name += "." + ss.str();
    }

    /// Creates an empty stream that reports itself under the given name.
    stream(const std::string& name) : _name(name) {
    }

  /// Make copy constructor and assignment operator private
  private:
    stream(const stream< __STREAM_T__ >& chn):
        _name(chn._name), _data(chn._data) {
    }

    stream& operator = (const stream< __STREAM_T__ >& chn) {
        _name = chn._name;
        _data = chn._data;
        return *this;
    }

  public:
    /// Overload >> and << operators to implement read() and write()
    void operator >> (__STREAM_T__& rdata) {
        read(rdata);
    }

    void operator << (const __STREAM_T__& wdata) {
        write(wdata);
    }


  public:
    /// Destructor
    /// Check status of the queue: warns on stdout if elements were never read.
    virtual ~stream() {
        if (!_data.empty())
        {
            std::cout << "WARNING: Hls::stream '"
                      << _name
                      << "' contains leftover data,"
                      << " which may result in RTL simulation hanging."
                      << std::endl;
        }
    }

    /// Status of the queue
    /// Returns true when no element is queued.
    bool empty() {
#ifdef HLS_STREAM_THREAD_SAFE
        std::lock_guard<std::mutex> lg(_mutex);
#endif
        return _data.empty();
    }

    /// Always false: this model places no bound on the queue length.
    bool full() const { return false; }

    /// Blocking read
    /// Stores the oldest element in head; see read() for the empty case.
    void read(__STREAM_T__& head) {
        head = read();
    }

#ifdef HLS_STREAM_THREAD_SAFE
    /// Removes and returns the oldest element, waiting on the condition
    /// variable until a writer has supplied one.
    __STREAM_T__ read() {
        std::unique_lock<std::mutex> ul(_mutex);
        while (_data.empty()) {
            _condition_var.wait(ul);
        }

        __STREAM_T__ elem;
        elem = _data.front();
        _data.pop_front();
        return elem;
    }
#else
    /// Removes and returns the oldest element. Reading an empty stream does
    /// not block in this configuration: it prints a warning and returns a
    /// value-initialized element.
    __STREAM_T__ read() {
        __STREAM_T__ elem;
        if (_data.empty()) {
            std::cout << "WARNING: Hls::stream '"
                      << _name
                      << "' is read while empty,"
                      << " which may result in RTL simulation hanging."
                      << std::endl;
            elem = __STREAM_T__();
        } else {
            elem = _data.front();
            _data.pop_front();
        }
        return elem;
    }
#endif

    /// Blocking write
    /// Appends tail to the queue and, in the thread-safe build, wakes one
    /// reader waiting in read().
    void write(const __STREAM_T__& tail) {
#ifdef HLS_STREAM_THREAD_SAFE
        std::unique_lock<std::mutex> ul(_mutex);
#endif
        _data.push_back(tail);
#ifdef HLS_STREAM_THREAD_SAFE
        _condition_var.notify_one();
#endif
    }

    /// Nonblocking read
    /// Returns true and stores the oldest element in head when one is
    /// available; otherwise stores a value-initialized element and returns
    /// false.
    bool read_nb(__STREAM_T__& head) {
#ifdef HLS_STREAM_THREAD_SAFE
        std::lock_guard<std::mutex> lg(_mutex);
#endif
        bool is_empty = _data.empty();
        if (is_empty) {
            head = __STREAM_T__();
        } else {
            __STREAM_T__ elem(_data.front());
            _data.pop_front();
            head = elem;
        }
        return !is_empty;
    }

    /// Nonblocking write
    /// Writes tail and returns whether the stream had room; since full() is
    /// always false the write always takes place.
    bool write_nb(const __STREAM_T__& tail) {
        bool is_full = full();
        write(tail);
        return !is_full;
    }

    /// Fifo size
    /// Returns the number of queued elements.
    size_t size() {
#ifdef HLS_STREAM_THREAD_SAFE
        // Take the same lock as every other accessor so the deque is not
        // inspected while another thread is modifying it.
        std::lock_guard<std::mutex> lg(_mutex);
#endif
        return _data.size();
    }
};

} // namespace hls
260
+
261
+ #endif // __cplusplus
262
+ #endif // X_HLS_STREAM_SIM_H
263
+
@@ -0,0 +1,80 @@
1
+ #ifndef X_HLS_UTILS_H
2
+ #define X_HLS_UTILS_H
3
+ #include "ap_fixed.h"
4
+ #include <limits>
5
+
6
+ namespace hls {
7
+
8
/// Primary template: forwards max(), min() and epsilon() to the
/// std::numeric_limits<T> members of the same name.
template<typename T>
class numeric_limits {
    typedef std::numeric_limits<T> std_limits;

public:
    static T max() {
        return std_limits::max();
    }

    static T min() {
        return std_limits::min();
    }

    static T epsilon() {
        return std_limits::epsilon();
    }
};
15
+
16
+ template <int W, int I, ap_q_mode Q, ap_o_mode O>
17
+ class numeric_limits<ap_fixed<W,I,Q,O> > {
18
+ public:
19
+ static ap_fixed<W,I,Q,O> max() {
20
+ ap_int<W> m = ::hls::numeric_limits<ap_int<W> >::max();
21
+ ap_fixed<W,I,Q,O> x;
22
+ x(W-1,0) = m(W-1,0);
23
+ return x;
24
+ }
25
+ static ap_fixed<W,I,Q,O> min() {
26
+ ap_int<W> m = ::hls::numeric_limits<ap_int<W> >::min();
27
+ ap_fixed<W,I,Q,O> x;
28
+ x(W-1,0) = m(W-1,0);
29
+ return x;
30
+ }
31
+ static ap_fixed<W,I,Q,O> epsilon() {
32
+ ap_fixed<W,I,Q,O> x = 0;
33
+ x[0] = 1;
34
+ return x;
35
+ }
36
+ };
37
+
38
+ template <int W, int I, ap_q_mode Q, ap_o_mode O>
39
+ class numeric_limits<ap_ufixed<W,I,Q,O> > {
40
+ public:
41
+ static ap_ufixed<W,I,Q,O> max() {
42
+ ap_uint<W> m = ::hls::numeric_limits<ap_uint<W> >::max();
43
+ ap_ufixed<W,I,Q,O> x;
44
+ x(W-1,0) = m(W-1,0);
45
+ return x;
46
+ }
47
+ static ap_ufixed<W,I,Q,O> min() { return 0; }
48
+ static ap_ufixed<W,I,Q,O> epsilon() {
49
+ ap_ufixed<W,I,Q,O> x = 0;
50
+ x[0] = 1;
51
+ return x;
52
+ }
53
+ };
54
+
55
+ template <int W>
56
+ class numeric_limits<ap_int<W> > {
57
+ public:
58
+ static ap_int<W> max() { ap_int<W> m = min(); return ~m; }
59
+ static ap_int<W> min() { ap_int<W> m = 0; m[W-1] = 1; return m; }
60
+ static ap_int<W> epsilon() {
61
+ ap_int<W> x = 0;
62
+ x[0] = 1;
63
+ return x;
64
+ }
65
+ };
66
+
67
+ template <int W>
68
+ class numeric_limits<ap_uint<W> > {
69
+ public:
70
+ static ap_uint<W> max() { ap_uint<W> zero = 0; return ~zero; }
71
+ static ap_uint<W> min() { return 0; }
72
+ static ap_uint<W> epsilon() {
73
+ ap_uint<W> x = 0;
74
+ x[0] = 1;
75
+ return x;
76
+ }
77
+ };
78
+ }
79
+
80
+ #endif
@@ -0,0 +1,71 @@
1
+ #pragma once
2
+ #include <cstddef>
3
+
4
+ #ifdef _OPENMP
5
+ #include <algorithm>
6
+ #include <omp.h>
7
+ constexpr bool _openmp = true;
8
+ #else
9
+ constexpr bool _openmp = false;
10
+ #endif
11
+
12
/// Runs CONFIG_T::f on n_samples consecutive samples.
///
/// c_inp holds n_samples groups of CONFIG_T::N_inp values and c_out receives
/// n_samples groups of CONFIG_T::N_out values. Each sample is first copied
/// (and thereby converted) into a CONFIG_T::inp_t buffer, and the
/// CONFIG_T::out_t results are converted back to T on the way out.
template <typename CONFIG_T, typename T>
void _inference(T *c_inp, T *c_out, size_t n_samples) {
    typename CONFIG_T::inp_t in_fixed_buf[CONFIG_T::N_inp];
    typename CONFIG_T::out_t out_fixed_buf[CONFIG_T::N_out];

    // Cursors that walk the flat buffers one sample at a time.
    T *sample_in = c_inp;
    T *sample_out = c_out;

    for (size_t remaining = n_samples; remaining != 0; --remaining) {
        for (size_t k = 0; k < CONFIG_T::N_inp; ++k) {
            in_fixed_buf[k] = sample_in[k];
        }

        CONFIG_T::f(in_fixed_buf, out_fixed_buf);

        for (size_t k = 0; k < CONFIG_T::N_out; ++k) {
            sample_out[k] = out_fixed_buf[k];
        }

        sample_in += CONFIG_T::N_inp;
        sample_out += CONFIG_T::N_out;
    }
}
31
+
32
+ template <typename CONFIG_T, typename T>
33
+ void batch_inference(T *c_inp, T *c_out, size_t n_samples, size_t n_threads) {
34
+ if (n_threads > 1 || n_threads == 0) {
35
+ #ifdef _OPENMP
36
+ size_t min_samples_per_thread;
37
+ size_t n_max_threads;
38
+ if (n_threads == 0) {
39
+ min_samples_per_thread = 1;
40
+ n_max_threads = omp_get_max_threads();
41
+ }
42
+ else {
43
+ min_samples_per_thread = std::max<size_t>(1, n_samples / n_threads);
44
+ n_max_threads = n_threads;
45
+ }
46
+ size_t n_sample_per_thread =
47
+ n_samples / n_max_threads + (n_samples % n_max_threads ? 1 : 0);
48
+ n_sample_per_thread =
49
+ std::max<size_t>(n_sample_per_thread, min_samples_per_thread);
50
+ size_t n_thread = n_samples / n_sample_per_thread;
51
+ n_thread += (n_samples % n_sample_per_thread) ? 1 : 0;
52
+
53
+ #pragma omp parallel for num_threads(n_thread) schedule(static)
54
+ for (size_t i = 0; i < n_thread; ++i) {
55
+ size_t start = i * n_sample_per_thread;
56
+ size_t end = std::min<size_t>(start + n_sample_per_thread, n_samples);
57
+ size_t n_samples_this_thread = end - start;
58
+ size_t offset_in = start * CONFIG_T::N_inp;
59
+ size_t offset_out = start * CONFIG_T::N_out;
60
+ _inference<CONFIG_T, T>(
61
+ &c_inp[offset_in], &c_out[offset_out], n_samples_this_thread
62
+ );
63
+ }
64
+ #else
65
+ _inference<CONFIG_T, T>(c_inp, c_out, n_samples);
66
+ #endif
67
+ }
68
+ else {
69
+ _inference<CONFIG_T, T>(c_inp, c_out, n_samples);
70
+ }
71
+ }
@@ -0,0 +1,22 @@
1
# Builds lib$(PRJ_NAME)_$(STAMP).so from $(PRJ_NAME).cc and
# $(PRJ_NAME)_bridge.cc. PRJ_NAME and STAMP are not defined in this file and
# must be supplied by the caller; EXTRA_CXXFLAGS is appended to both the
# compile and the link command when set.
default: slow
INCLUDES = -I ap_types -I .
CXXFLAGS = -fPIC
# NOTE(review): CFLAGS is defined but not referenced by any rule in this file,
# so -std=c++17 is never passed to the compiler - confirm whether it should be.
CFLAGS = -std=c++17 -fPIC
LIBNAME = lib$(PRJ_NAME)_$(STAMP).so

fast: CXXFLAGS += -O3
fast: $(LIBNAME)

slow: CXXFLAGS += -O
slow: $(LIBNAME)

# The source is C++, so compile it with the same driver that performs the link.
$(PRJ_NAME)_$(STAMP).o: $(PRJ_NAME).cc
	$(CXX) -c $(PRJ_NAME).cc -o $(PRJ_NAME)_$(STAMP).o $(INCLUDES) $(CXXFLAGS) $(EXTRA_CXXFLAGS)

$(LIBNAME): $(PRJ_NAME)_$(STAMP).o $(PRJ_NAME)_bridge.cc
	$(CXX) $(INCLUDES) $(CXXFLAGS) -shared -o $@ $(PRJ_NAME)_$(STAMP).o $(PRJ_NAME)_bridge.cc $(EXTRA_CXXFLAGS)

clean:
	rm -f $(LIBNAME) $(PRJ_NAME)_$(STAMP).o

# None of these targets produce a file of the same name.
.PHONY: default fast slow clean
@@ -0,0 +1,32 @@
1
+ #pragma once
2
+ #include "ap_fixed.h"
3
+
4
+ template <int s, int b, int i, ap_q_mode Q, ap_o_mode O, int N>
5
+ ap_fixed<b, i + s> bit_shift(ap_fixed<b, i, Q, O, N> x) {
6
+ #pragma HLS INLINE
7
+ ap_fixed<b, i + s> r;
8
+ r.range() = x.range();
9
+ return r;
10
+ };
11
+
12
+ template <int s, int b, int i, ap_q_mode Q, ap_o_mode O, int N>
13
+ ap_ufixed<b, i + s> bit_shift(ap_ufixed<b, i, Q, O, N> x) {
14
+ #pragma HLS INLINE
15
+ ap_ufixed<b, i + s> r;
16
+ r.range() = x.range();
17
+ return r;
18
+ };
19
+
20
+ template <int s, int b> ap_fixed<b, s> bit_shift(ap_int<b> x) {
21
+ #pragma HLS INLINE
22
+ ap_fixed<b, s> r;
23
+ r.range() = x.range();
24
+ return r;
25
+ };
26
+
27
+ template <int s, int b> ap_ufixed<b, s> bit_shift(ap_uint<b> x) {
28
+ #pragma HLS INLINE
29
+ ap_ufixed<b, s> r;
30
+ r.range() = x.range();
31
+ return r;
32
+ };
@@ -0,0 +1,15 @@
1
+ from .rtl_model import RTLModel, VerilogModel, VHDLModel
2
+ from .verilog import comb_logic_gen as verilog_comb_logic_gen
3
+ from .verilog import generate_io_wrapper as verilog_generate_io_wrapper
4
+ from .vhdl import comb_logic_gen as vhdl_comb_logic_gen
5
+ from .vhdl import generate_io_wrapper as vhdl_generate_io_wrapper
6
+
7
+ __all__ = [
8
+ 'RTLModel',
9
+ 'VerilogModel',
10
+ 'VHDLModel',
11
+ 'verilog_comb_logic_gen',
12
+ 'verilog_generate_io_wrapper',
13
+ 'vhdl_comb_logic_gen',
14
+ 'vhdl_generate_io_wrapper',
15
+ ]
@@ -0,0 +1,99 @@
1
+ #include "ioutil.hh"
2
+ #include <verilated.h>
3
+
4
+ #ifdef _OPENMP
5
+ #include <omp.h>
6
+ constexpr bool _openmp = true;
7
+ #else
8
+ constexpr bool _openmp = false;
9
+ #endif
10
+
11
+ template <typename CONFIG_T>
12
+ std::enable_if_t<CONFIG_T::II != 0>
13
+ _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
14
+ auto dut = std::make_unique<typename CONFIG_T::dut_t>();
15
+
16
+ size_t clk_req = n_samples * CONFIG_T::II + (CONFIG_T::latency - CONFIG_T::II) + 1;
17
+
18
+ for (size_t t_inp = 0; t_inp < clk_req; ++t_inp) {
19
+ size_t t_out = t_inp - CONFIG_T::latency;
20
+
21
+ if (t_inp < n_samples * CONFIG_T::II && t_inp % CONFIG_T::II == 0) {
22
+ write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(
23
+ dut->model_inp, &c_inp[t_inp / CONFIG_T::II * CONFIG_T::N_inp]
24
+ );
25
+ }
26
+
27
+ dut->clk = 0;
28
+ dut->eval();
29
+ dut->clk = 1;
30
+ dut->eval();
31
+
32
+ if (t_inp >= CONFIG_T::latency && t_out % CONFIG_T::II == 0) {
33
+ read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(
34
+ dut->model_out, &c_out[t_out / CONFIG_T::II * CONFIG_T::N_out]
35
+ );
36
+ }
37
+ }
38
+
39
+ dut->final();
40
+ }
41
+
42
+ template <typename CONFIG_T>
43
+ std::enable_if_t<CONFIG_T::II == 0>
44
+ _inference(int32_t *c_inp, int32_t *c_out, size_t n_samples) {
45
+ auto dut = std::make_unique<typename CONFIG_T::dut_t>();
46
+
47
+ for (size_t i = 0; i < n_samples; ++i) {
48
+ write_input<CONFIG_T::N_inp, CONFIG_T::max_inp_bw>(
49
+ dut->model_inp, &c_inp[i * CONFIG_T::N_inp]
50
+ );
51
+ dut->eval();
52
+ read_output<CONFIG_T::N_out, CONFIG_T::max_out_bw>(
53
+ dut->model_out, &c_out[i * CONFIG_T::N_out]
54
+ );
55
+ }
56
+
57
+ dut->final();
58
+ }
59
+
60
+ template <typename CONFIG_T>
61
+ void batch_inference(int32_t *c_inp, int32_t *c_out, size_t n_samples, size_t n_threads) {
62
+ if (n_threads > 1 || n_threads == 0) {
63
+ #ifdef _OPENMP
64
+ size_t min_samples_per_thread;
65
+ size_t n_max_threads;
66
+ if (n_threads == 0) {
67
+ min_samples_per_thread = 1;
68
+ n_max_threads = omp_get_max_threads();
69
+ }
70
+ else {
71
+ min_samples_per_thread = std::max<size_t>(1, n_samples / n_threads);
72
+ n_max_threads = n_threads;
73
+ }
74
+ size_t n_sample_per_thread =
75
+ n_samples / n_max_threads + (n_samples % n_max_threads ? 1 : 0);
76
+ n_sample_per_thread =
77
+ std::max<size_t>(n_sample_per_thread, min_samples_per_thread);
78
+ size_t n_thread = n_samples / n_sample_per_thread;
79
+ n_thread += (n_samples % n_sample_per_thread) ? 1 : 0;
80
+
81
+ #pragma omp parallel for num_threads(n_thread) schedule(static)
82
+ for (size_t i = 0; i < n_thread; ++i) {
83
+ size_t start = i * n_sample_per_thread;
84
+ size_t end = std::min<size_t>(start + n_sample_per_thread, n_samples);
85
+ size_t n_samples_this_thread = end - start;
86
+ size_t offset_in = start * CONFIG_T::N_inp;
87
+ size_t offset_out = start * CONFIG_T::N_out;
88
+ _inference<CONFIG_T>(
89
+ &c_inp[offset_in], &c_out[offset_out], n_samples_this_thread
90
+ );
91
+ }
92
+ #else
93
+ _inference<CONFIG_T>(c_inp, c_out, n_samples);
94
+ #endif
95
+ }
96
+ else {
97
+ _inference<CONFIG_T>(c_inp, c_out, n_samples);
98
+ }
99
+ }
@@ -0,0 +1,34 @@
1
# Verilates $(VM_PREFIX).v and links the result with $(VM_PREFIX)_binder.cc
# into lib$(VM_PREFIX)_$(STAMP).so. VM_PREFIX and STAMP are not defined in this
# file and must be supplied by the caller; EXTRA_CXXFLAGS, VERILATOR_FLAGS and
# N_JOBS are optional.
default: slow

VERILATOR_ROOT = $(shell verilator -V | grep -a VERILATOR_ROOT | tail -1 | awk '{{print $$3}}')
INCLUDES = -I./obj_dir -I$(VERILATOR_ROOT)/include -I../src
WARNINGS = -Wl,--no-undefined
CFLAGS = -std=c++17 -fPIC
LINKFLAGS = $(INCLUDES) $(WARNINGS)
LIBNAME = lib$(VM_PREFIX)_$(STAMP).so
N_JOBS ?= $(shell nproc)
VERILATOR_FLAGS ?=

# vhdl specific - convert to verilog first for verilating
../src/$(VM_PREFIX).v: $(wildcard ../src/$(VM_PREFIX).vhd) $(wildcard ../src/$(VM_PREFIX)_stage*.vhd)
	mkdir -p obj_dir
	cp ../src/memfiles/* ./ 2>/dev/null || true
	ghdl -a --std=08 --workdir=obj_dir ../src/static/multiplier.vhd ../src/static/mux.vhd ../src/static/negative.vhd ../src/static/shift_adder.vhd ../src/static/lookup_table.vhd $(wildcard ../src/$(VM_PREFIX:_wrapper=)_stage*.vhd) $(wildcard ../src/$(VM_PREFIX:_wrapper=).vhd) ../src/$(VM_PREFIX).vhd
	ghdl synth --std=08 --workdir=obj_dir --out=verilog $(VM_PREFIX) > $(VM_PREFIX).v

./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a: ../src/$(VM_PREFIX).v $(wildcard ../src/$(VM_PREFIX)_stage*.v)
	verilator --cc -j $(N_JOBS) -build $(VM_PREFIX).v --prefix V$(VM_PREFIX) $(VERILATOR_FLAGS) -CFLAGS "$(CFLAGS)" -I../src -I../src/static

# NOTE(review): CXXFLAGS2 is not defined in this file; it expands to nothing
# unless the caller provides it - confirm whether that is intended.
$(LIBNAME): ./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a $(VM_PREFIX)_binder.cc
	$(CXX) $(CFLAGS) $(LINKFLAGS) $(CXXFLAGS2) -pthread -shared -o $(LIBNAME) $(VM_PREFIX)_binder.cc ./obj_dir/libV$(VM_PREFIX).a ./obj_dir/libverilated.a ./obj_dir/V$(VM_PREFIX)__ALL.a $(EXTRA_CXXFLAGS)


fast: CFLAGS += -O3
fast: $(LIBNAME)

slow: CFLAGS += -O
slow: $(LIBNAME)

clean:
	rm -rf obj_dir
	rm -f $(LIBNAME)

# None of these targets produce a file of the same name.
.PHONY: default fast slow clean