da4ml 0.5.0__cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- da4ml/__init__.py +4 -0
- da4ml/_binary/__init__.py +15 -0
- da4ml/_binary/dais_bin.cpython-312-x86_64-linux-gnu.so +0 -0
- da4ml/_binary/dais_bin.pyi +5 -0
- da4ml/_cli/__init__.py +30 -0
- da4ml/_cli/convert.py +194 -0
- da4ml/_cli/report.py +295 -0
- da4ml/_version.py +32 -0
- da4ml/cmvm/__init__.py +4 -0
- da4ml/cmvm/api.py +264 -0
- da4ml/cmvm/core/__init__.py +221 -0
- da4ml/cmvm/core/indexers.py +83 -0
- da4ml/cmvm/core/state_opr.py +284 -0
- da4ml/cmvm/types.py +739 -0
- da4ml/cmvm/util/__init__.py +7 -0
- da4ml/cmvm/util/bit_decompose.py +86 -0
- da4ml/cmvm/util/mat_decompose.py +121 -0
- da4ml/codegen/__init__.py +9 -0
- da4ml/codegen/hls/__init__.py +4 -0
- da4ml/codegen/hls/hls_codegen.py +196 -0
- da4ml/codegen/hls/hls_model.py +255 -0
- da4ml/codegen/hls/source/ap_types/ap_binary.h +78 -0
- da4ml/codegen/hls/source/ap_types/ap_common.h +376 -0
- da4ml/codegen/hls/source/ap_types/ap_decl.h +212 -0
- da4ml/codegen/hls/source/ap_types/ap_fixed.h +360 -0
- da4ml/codegen/hls/source/ap_types/ap_fixed_base.h +2354 -0
- da4ml/codegen/hls/source/ap_types/ap_fixed_ref.h +718 -0
- da4ml/codegen/hls/source/ap_types/ap_fixed_special.h +230 -0
- da4ml/codegen/hls/source/ap_types/ap_int.h +330 -0
- da4ml/codegen/hls/source/ap_types/ap_int_base.h +1885 -0
- da4ml/codegen/hls/source/ap_types/ap_int_ref.h +1346 -0
- da4ml/codegen/hls/source/ap_types/ap_int_special.h +223 -0
- da4ml/codegen/hls/source/ap_types/ap_shift_reg.h +138 -0
- da4ml/codegen/hls/source/ap_types/etc/ap_private.h +7199 -0
- da4ml/codegen/hls/source/ap_types/hls_math.h +27 -0
- da4ml/codegen/hls/source/ap_types/hls_stream.h +263 -0
- da4ml/codegen/hls/source/ap_types/utils/x_hls_utils.h +80 -0
- da4ml/codegen/hls/source/binder_util.hh +71 -0
- da4ml/codegen/hls/source/build_binder.mk +22 -0
- da4ml/codegen/hls/source/vitis_bitshift.hh +32 -0
- da4ml/codegen/rtl/__init__.py +15 -0
- da4ml/codegen/rtl/common_source/binder_util.hh +99 -0
- da4ml/codegen/rtl/common_source/build_binder.mk +34 -0
- da4ml/codegen/rtl/common_source/build_quartus_prj.tcl +104 -0
- da4ml/codegen/rtl/common_source/build_vivado_prj.tcl +111 -0
- da4ml/codegen/rtl/common_source/ioutil.hh +124 -0
- da4ml/codegen/rtl/common_source/template.sdc +27 -0
- da4ml/codegen/rtl/common_source/template.xdc +30 -0
- da4ml/codegen/rtl/rtl_model.py +486 -0
- da4ml/codegen/rtl/verilog/__init__.py +10 -0
- da4ml/codegen/rtl/verilog/comb.py +239 -0
- da4ml/codegen/rtl/verilog/io_wrapper.py +113 -0
- da4ml/codegen/rtl/verilog/pipeline.py +67 -0
- da4ml/codegen/rtl/verilog/source/lookup_table.v +27 -0
- da4ml/codegen/rtl/verilog/source/multiplier.v +37 -0
- da4ml/codegen/rtl/verilog/source/mux.v +58 -0
- da4ml/codegen/rtl/verilog/source/negative.v +31 -0
- da4ml/codegen/rtl/verilog/source/shift_adder.v +59 -0
- da4ml/codegen/rtl/vhdl/__init__.py +9 -0
- da4ml/codegen/rtl/vhdl/comb.py +206 -0
- da4ml/codegen/rtl/vhdl/io_wrapper.py +120 -0
- da4ml/codegen/rtl/vhdl/pipeline.py +71 -0
- da4ml/codegen/rtl/vhdl/source/lookup_table.vhd +52 -0
- da4ml/codegen/rtl/vhdl/source/multiplier.vhd +40 -0
- da4ml/codegen/rtl/vhdl/source/mux.vhd +102 -0
- da4ml/codegen/rtl/vhdl/source/negative.vhd +35 -0
- da4ml/codegen/rtl/vhdl/source/shift_adder.vhd +101 -0
- da4ml/converter/__init__.py +63 -0
- da4ml/converter/hgq2/__init__.py +3 -0
- da4ml/converter/hgq2/layers/__init__.py +11 -0
- da4ml/converter/hgq2/layers/_base.py +132 -0
- da4ml/converter/hgq2/layers/activation.py +81 -0
- da4ml/converter/hgq2/layers/attn.py +148 -0
- da4ml/converter/hgq2/layers/batchnorm.py +15 -0
- da4ml/converter/hgq2/layers/conv.py +149 -0
- da4ml/converter/hgq2/layers/dense.py +39 -0
- da4ml/converter/hgq2/layers/ops.py +240 -0
- da4ml/converter/hgq2/layers/pool.py +107 -0
- da4ml/converter/hgq2/layers/table.py +176 -0
- da4ml/converter/hgq2/parser.py +161 -0
- da4ml/trace/__init__.py +6 -0
- da4ml/trace/fixed_variable.py +965 -0
- da4ml/trace/fixed_variable_array.py +600 -0
- da4ml/trace/ops/__init__.py +13 -0
- da4ml/trace/ops/einsum_utils.py +305 -0
- da4ml/trace/ops/quantization.py +74 -0
- da4ml/trace/ops/reduce_utils.py +105 -0
- da4ml/trace/pipeline.py +181 -0
- da4ml/trace/tracer.py +186 -0
- da4ml/typing/__init__.py +3 -0
- da4ml-0.5.0.dist-info/METADATA +85 -0
- da4ml-0.5.0.dist-info/RECORD +96 -0
- da4ml-0.5.0.dist-info/WHEEL +6 -0
- da4ml-0.5.0.dist-info/entry_points.txt +3 -0
- da4ml-0.5.0.dist-info/sboms/auditwheel.cdx.json +1 -0
- da4ml.libs/libgomp-e985bcbb.so.1.0.0 +0 -0
|
@@ -0,0 +1,486 @@
|
|
|
1
|
+
import ctypes
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import shutil
|
|
6
|
+
import subprocess
|
|
7
|
+
import sys
|
|
8
|
+
from collections.abc import Sequence
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
from uuid import uuid4
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
from numpy.typing import NDArray
|
|
15
|
+
|
|
16
|
+
from ...cmvm.types import CombLogic, Pipeline, _minimal_kif
|
|
17
|
+
from ...trace.pipeline import to_pipeline
|
|
18
|
+
from .. import rtl
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_io_kifs(sol: CombLogic | Pipeline):
    """Collect the minimal (k, i, f) descriptors of a solution's inputs and outputs.

    Every entry of ``sol.inp_qint`` / ``sol.out_qint`` is passed through
    ``_minimal_kif``; the per-port results are then transposed so that each
    row of the returned arrays holds one descriptor component across all ports.

    Parameters
    ----------
    sol : CombLogic | Pipeline
        Solution exposing ``inp_qint`` and ``out_qint``.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        ``(input_kifs, output_kifs)``, both stored as ``np.int8``.
    """
    input_rows = [_minimal_kif(qint) for qint in sol.inp_qint]
    output_rows = [_minimal_kif(qint) for qint in sol.out_qint]

    # zip(*rows) turns the per-port tuples into per-component columns.
    input_columns = tuple(zip(*input_rows))
    output_columns = tuple(zip(*output_rows))

    return np.array(input_columns, np.int8), np.array(output_columns, np.int8)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def binder_gen(csol: Pipeline | CombLogic, module_name: str, II: int = 1, latency_multiplier: int = 1):
    """Render the C++ binder source that exposes the simulated module to ctypes.

    The generated translation unit defines a ``<module_name>_config`` struct
    (port counts, maximum padded I/O bit-widths, II, latency and the DUT type
    ``V<module_name>``) and two ``extern "C"`` entry points:
    ``openmp_enabled`` and ``inference``.

    Parameters
    ----------
    csol : Pipeline | CombLogic
        Solution the binder is generated for.
    module_name : str
        Name of the wrapped module; the DUT class is ``V{module_name}``.
    II : int, optional
        Initiation interval written to the config, by default 1.
        Forced to 0 for a ``CombLogic`` solution.
    latency_multiplier : int, optional
        Factor applied to the number of pipeline stages to obtain the
        latency, by default 1. Ignored for a ``CombLogic`` solution.

    Returns
    -------
    str
        The C++ source code of the binder.
    """
    n_in, n_out = csol.shape

    k_in, i_in, f_in = zip(*(_minimal_kif(qint) for qint in csol.inp_qint))
    k_out, i_out, f_out = zip(*(_minimal_kif(qint) for qint in csol.out_qint))

    # Uniform port width: the largest value of each component, summed.
    max_inp_bw = max(k_in) + max(i_in) + max(f_in)
    max_out_bw = max(k_out) + max(i_out) + max(f_out)

    is_comb = isinstance(csol, CombLogic)
    if is_comb:
        # Pure combinational logic: both timing figures are reported as 0.
        II = 0
    latency = 0 if is_comb else len(csol.solutions) * latency_multiplier

    binder_src = f"""#include <cstddef>
#include "binder_util.hh"
#include "V{module_name}.h"

struct {module_name}_config {{
static const size_t N_inp = {n_in};
static const size_t N_out = {n_out};
static const size_t max_inp_bw = {max_inp_bw};
static const size_t max_out_bw = {max_out_bw};
static const size_t II = {II};
static const size_t latency = {latency};
typedef V{module_name} dut_t;
}};

extern "C" {{
bool openmp_enabled() {{
return _openmp;
}}

void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples, size_t n_threads) {{
batch_inference<{module_name}_config>(c_inp, c_out, n_samples, n_threads);
}}
}}
"""
    return binder_src
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class at_path:
|
|
65
|
+
def __init__(self, path: str | Path):
|
|
66
|
+
self._path = Path(path)
|
|
67
|
+
self._orig_cwd = None
|
|
68
|
+
|
|
69
|
+
def __enter__(self):
|
|
70
|
+
self._orig_cwd = Path.cwd()
|
|
71
|
+
os.chdir(self._path)
|
|
72
|
+
|
|
73
|
+
def __exit__(self, exc_type, exc_value, traceback):
|
|
74
|
+
os.chdir(self._orig_cwd) # type: ignore
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class RTLModel:
    """Generate an RTL project for a solution and emulate it through a compiled binder.

    The model writes HDL sources, build scripts, timing constraints and a C++
    binder into ``path`` (see :meth:`write`), builds the binder into a shared
    library with ``make`` (see :meth:`compile`), and runs inference through
    that library with ``ctypes`` (see :meth:`predict`).
    """

    def __init__(
        self,
        solution: CombLogic | Pipeline,
        prj_name: str,
        path: str | Path,
        flavor: str = 'verilog',
        latency_cutoff: float = -1,
        print_latency: bool = True,
        part_name: str = 'xcvu13p-flga2577-2-e',
        clock_period: float = 5,
        clock_uncertainty: float = 0.1,
        io_delay_minmax: tuple[float, float] = (0.2, 0.4),
        register_layers: int = 1,
    ):
        """Set up the model; nothing is written to disk here.

        Parameters
        ----------
        solution : CombLogic | Pipeline
            The logic to generate. A ``Pipeline`` is used as is; a
            ``CombLogic`` is converted with ``to_pipeline`` when
            ``latency_cutoff > 0`` and kept combinational otherwise.
        prj_name : str
            Project name, used for the top module and generated file names.
        path : str | Path
            Project directory (resolved to an absolute path).
        flavor : str, optional
            ``'verilog'`` or ``'vhdl'`` (case-insensitive), by default 'verilog'
        latency_cutoff : float, optional
            Latency cutoff handed to ``to_pipeline``, by default -1 (no pipelining
            of a ``CombLogic`` solution).
        print_latency : bool, optional
            Forwarded to the HDL code generators, by default True
        part_name : str, optional
            Value substituted for ``$::env(DEVICE)`` in the build scripts.
        clock_period : float, optional
            Value substituted for ``$::env(CLOCK_PERIOD)`` in the constraint
            templates, by default 5 (unit is defined by the templates).
        clock_uncertainty : float, optional
            Value substituted for both the setup and hold uncertainty, by default 0.1
        io_delay_minmax : tuple[float, float], optional
            ``(min, max)`` I/O delay substituted in the constraints, by default (0.2, 0.4)
        register_layers : int, optional
            Forwarded to the pipeline code generator and used as the latency
            multiplier of the binder, by default 1

        Raises
        ------
        AssertionError
            If ``flavor`` is neither ``'vhdl'`` nor ``'verilog'``.
        """
        self._flavor = flavor.lower()
        self._solution = solution
        self._path = Path(path).resolve()
        self._prj_name = prj_name
        self._latency_cutoff = latency_cutoff
        self._print_latency = print_latency
        self.__src_root = Path(rtl.__file__).parent
        self._part_name = part_name
        self._clock_period = clock_period
        self._clock_uncertainty = clock_uncertainty
        self._io_delay_minmax = io_delay_minmax
        self._register_layers = register_layers
        # When True, write() skips emitting the main logic and memory files.
        self._place_holder = False

        assert self._flavor in ('vhdl', 'verilog'), f'Unsupported flavor {flavor}, only vhdl and verilog are supported.'

        self._pipe = solution if isinstance(solution, Pipeline) else None
        if latency_cutoff > 0 and self._pipe is None:
            assert isinstance(solution, CombLogic)
            self._pipe = to_pipeline(solution, latency_cutoff, verbose=False)

        if self._pipe is not None:
            # get actual latency cutoff
            latency_cutoff = int(max(max(st.latency) / (i + 1) for i, st in enumerate(self._pipe.solutions)))
            self._latency_cutoff = latency_cutoff

        # Handle of the compiled emulator and the stamp identifying its .so file.
        self._lib = None
        self._uuid = None

    def write(self, metadata: None | dict[str, Any] = None):
        """Write the RTL project to the specified path.

        Parameters
        ----------
        metadata : dict[str, Any] | None, optional
            Additional metadata to write to `metadata.json`, by default None.
            Keys generated here (``cost``, ``flavor``, and for pipelines
            ``latency`` and ``reg_bits``) take precedence over equally named
            keys in ``metadata``. The passed dictionary is not modified.
        """

        flavor = self._flavor
        suffix = 'v' if flavor == 'verilog' else 'vhd'
        if flavor == 'vhdl':
            from .vhdl import comb_logic_gen, generate_io_wrapper, pipeline_logic_gen
        else:  # verilog
            from .verilog import comb_logic_gen, generate_io_wrapper, pipeline_logic_gen

        from .verilog.comb import table_mem_gen

        (self._path / 'src/static').mkdir(parents=True, exist_ok=True)
        (self._path / 'sim').mkdir(exist_ok=True)
        (self._path / 'model').mkdir(exist_ok=True)
        (self._path / 'src/memfiles').mkdir(exist_ok=True)

        # Build scripts
        for path in (self.__src_root).glob('common_source/build_*_prj.tcl'):
            with open(path) as f:
                tcl = f.read()
            tcl = tcl.replace('$::env(DEVICE)', self._part_name)
            tcl = tcl.replace('$::env(PROJECT_NAME)', self._prj_name)
            tcl = tcl.replace('$::env(SOURCE_TYPE)', flavor)
            with open(self._path / path.name, 'w') as f:
                f.write(tcl)

        if self._pipe is not None:  # Pipeline
            if not self._place_holder:
                # Main logic
                codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency, register_layers=self._register_layers)

                # Table memory files
                memfiles: dict[str, str] = {}
                for comb in self._pipe.solutions:
                    memfiles.update(table_mem_gen(comb))

                for k, v in codes.items():
                    with open(self._path / f'src/{k}.{suffix}', 'w') as f:
                        f.write(v)
            else:
                memfiles = {}

            # Timing constraint
            for fmt in ('xdc', 'sdc'):
                with open(self.__src_root / f'common_source/template.{fmt}') as f:
                    constraint = f.read()
                constraint = constraint.replace('$::env(CLOCK_PERIOD)', str(self._clock_period))
                constraint = constraint.replace('$::env(UNCERTAINITY_SETUP)', str(self._clock_uncertainty))
                constraint = constraint.replace('$::env(UNCERTAINITY_HOLD)', str(self._clock_uncertainty))
                constraint = constraint.replace('$::env(DELAY_MAX)', str(self._io_delay_minmax[1]))
                constraint = constraint.replace('$::env(DELAY_MIN)', str(self._io_delay_minmax[0]))
                with open(self._path / f'src/{self._prj_name}.{fmt}', 'w') as f:
                    f.write(constraint)

            # C++ binder w/ HDL wrapper for uniform bw
            binder = binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)

            # Verilog IO wrapper (non-uniform bw to uniform one, clk passthrough)
            io_wrapper = generate_io_wrapper(self._pipe, self._prj_name, True)

            self._pipe.save(self._path / 'model/pipeline.json')
        else:  # Comb
            assert isinstance(self._solution, CombLogic)

            if not self._place_holder:
                # Table memory files
                memfiles = table_mem_gen(self._solution)

                # Main logic
                code = comb_logic_gen(self._solution, self._prj_name, self._print_latency, '`timescale 1ns/1ps')
                with open(self._path / f'src/{self._prj_name}.{suffix}', 'w') as f:
                    f.write(code)
            else:
                memfiles = {}

            # Verilog IO wrapper (non-uniform bw to uniform one, no clk)
            io_wrapper = generate_io_wrapper(self._solution, self._prj_name, False)
            binder = binder_gen(self._solution, f'{self._prj_name}_wrapper')

        # Write table memory files
        for name, mem in memfiles.items():
            with open(self._path / 'src/memfiles' / name, 'w') as f:
                f.write(mem)

        with open(self._path / f'src/{self._prj_name}_wrapper.{suffix}', 'w') as f:
            f.write(io_wrapper)
        with open(self._path / f'sim/{self._prj_name}_wrapper_binder.cc', 'w') as f:
            f.write(binder)

        # Common resource copy
        for path in self.__src_root.glob(f'{flavor}/source/*.{suffix}'):
            shutil.copy(path, self._path / 'src/static')

        shutil.copy(self.__src_root / 'common_source/build_binder.mk', self._path / 'sim')
        shutil.copy(self.__src_root / 'common_source/ioutil.hh', self._path / 'sim')
        shutil.copy(self.__src_root / 'common_source/binder_util.hh', self._path / 'sim')
        self._solution.save(self._path / 'model/comb.json')

        _metadata: dict[str, Any] = {'cost': self._solution.cost, 'flavor': self._flavor}
        if self._pipe is not None:
            _metadata['latency'] = len(self._pipe[0])
            _metadata['reg_bits'] = self._pipe.reg_bits

        if metadata is not None:
            # Merge into a fresh dict (same key order and precedence as
            # metadata.update(...)) so the caller's dictionary stays untouched.
            _metadata = {**metadata, **_metadata}

        # Serialize before opening so a failure cannot leave a truncated file.
        payload = json.dumps(_metadata)
        with open(self._path / 'metadata.json', 'w') as f:
            f.write(payload)

    def _compile(self, verbose=False, openmp=True, nproc=None, o3: bool = False, clean=True, _env: dict[str, str] | None = None):
        """Build and load the emulator from sources already on disk.

        Same as :meth:`compile`, except that :meth:`write` is not called: the
        project files are expected to exist under ``<path>/sim`` already.

        Parameters
        ----------
        verbose : bool, optional
            Verbose output, by default False. When False the output of the
            build commands is captured and only printed on failure.
        openmp : bool, optional
            Enable openmp, by default True
        nproc : int | None, optional
            Number of processes to use for compilation, by default None.
            If None, ``N_JOBS`` is not set here and the makefile decides
            (documented upstream as the number of CPU cores, capped at 32;
            not verifiable from this file).
        o3 : bool | None, optional
            Turn on -O3 flag (``fast`` make target), by default False
        clean : bool, optional
            Remove obsolete shared object files and `obj_dir`, by default True
        _env : dict[str, str] | None, optional
            Extra environment variables for the build; they override the
            ones set by this method, by default None

        Raises
        ------
        RuntimeError
            If compilation fails
        """

        self._uuid = str(uuid4())
        sim_dir = self._path / 'sim'
        args = ['make', '-f', 'build_binder.mk']
        env = os.environ.copy()
        env['VM_PREFIX'] = f'{self._prj_name}_wrapper'
        env['STAMP'] = self._uuid
        env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
        env['VERILATOR_FLAGS'] = '-Wall' if self._flavor == 'verilog' else ''
        if _env is not None:
            env.update(_env)
        if nproc is not None:
            env['N_JOBS'] = str(nproc)
        if o3:
            args.append('fast')

        if clean:
            # Drop previously built, uuid-stamped libraries before rebuilding.
            m = re.compile(r'^lib.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.so$')
            for p in sim_dir.iterdir():
                if not p.is_dir() and m.match(p.name):
                    p.unlink()
            subprocess.run(
                ['make', '-f', 'build_binder.mk', 'clean'],
                env=env,
                cwd=sim_dir,
                check=True,
                capture_output=not verbose,
            )

        try:
            # check=True turns any non-zero exit status into CalledProcessError,
            # so no separate return-code test is needed afterwards.
            subprocess.run(args, env=env, check=True, cwd=sim_dir, capture_output=not verbose)
        except subprocess.CalledProcessError as e:
            # stdout/stderr are None when the output was not captured
            # (verbose=True); it has already been shown on the console then.
            if e.stderr is not None:
                print(e.stderr.decode(errors='replace'), file=sys.stderr)
            if e.stdout is not None:
                print(e.stdout.decode(errors='replace'), file=sys.stdout)
            raise RuntimeError('Compilation failed!!') from e

        if clean:
            subprocess.run(['rm', '-rf', 'obj_dir'], cwd=sim_dir, check=True, capture_output=not verbose)

        self._load_lib(self._uuid)

    def _load_lib(self, uuid: str | None = None):
        """Load the compiled emulator library into ``self._lib``.

        Parameters
        ----------
        uuid : str | None, optional
            Stamp of the library to load. Falls back to the stamp of the last
            build; if there is none, the most recently modified
            ``sim/lib<prj_name>_wrapper_*.so`` is used.

        Raises
        ------
        RuntimeError
            If no library can be found, or the selected one does not exist.
        """
        uuid = uuid if uuid is not None else self._uuid
        if uuid is None:
            # No stamp known: pick the newest library so the choice is
            # deterministic when several builds are lying around.
            libs = sorted(
                self._path.glob(f'sim/lib{self._prj_name}_wrapper_*.so'),
                key=lambda p: p.stat().st_mtime,
            )
            if len(libs) == 0:
                raise RuntimeError(f'Cannot load library, found {len(libs)} libraries in {self._path}')
            uuid = libs[-1].name.split('_')[-1].split('.', 1)[0]
        self._uuid = uuid
        lib_path = self._path / f'sim/lib{self._prj_name}_wrapper_{uuid}.so'
        if not lib_path.exists():
            raise RuntimeError(f'Library {lib_path} does not exist')

        lib = ctypes.CDLL(str(lib_path))
        # Declare the prototypes emitted by binder_gen; ctypes would otherwise
        # treat the bool return as a C int and pass the size_t counts as C int.
        lib.openmp_enabled.argtypes = []
        lib.openmp_enabled.restype = ctypes.c_bool
        lib.inference.argtypes = [
            ctypes.POINTER(ctypes.c_int32),
            ctypes.POINTER(ctypes.c_int32),
            ctypes.c_size_t,
            ctypes.c_size_t,
        ]
        lib.inference.restype = None
        self._lib = lib

    def compile(
        self,
        verbose=False,
        openmp=True,
        nproc: int | None = None,
        o3: bool = False,
        clean=True,
        metadata: None | dict[str, Any] = None,
    ):
        """Compile the generated code to a emulator for logic simulation.

        Writes the project with :meth:`write` and then builds and loads the
        emulator library.

        Parameters
        ----------
        verbose : bool, optional
            Verbose output, by default False
        openmp : bool, optional
            Enable openmp, by default True
        nproc : int | None, optional
            Number of processes to use for compilation, by default None.
            If None, the makefile default applies (documented upstream as the
            number of CPU cores, capped at 32).
        o3 : bool | None, optional
            Turn on -O3 flag, by default False
        clean : bool, optional
            Remove obsolete shared object files and `obj_dir`, by default True
        metadata : dict[str, Any] | None, optional
            Additional metadata to write to `metadata.json`, by default None

        Raises
        ------
        RuntimeError
            If compilation fails
        """

        self.write(metadata=metadata)
        self._compile(verbose=verbose, openmp=openmp, nproc=nproc, o3=o3, clean=clean)

    def predict(self, data: NDArray | Sequence[NDArray], n_threads: int = 0) -> NDArray[np.float64]:
        """Run the model on the input data.

        Parameters
        ----------
        data : NDArray[np.floating]|Sequence[NDArray[np.floating]]
            Input data to the model. The shape is ignored, and the number of samples is
            determined by the size of the data. A sequence of arrays is
            flattened per sample and concatenated along the last axis.
        n_threads : int, optional
            Passed through to the compiled ``inference`` function, by default 0

        Returns
        -------
        NDArray[np.float64]
            Output of the model in shape (n_samples, output_size).

        Raises
        ------
        AssertionError
            If the emulator is not loaded, the data size is not a multiple of
            the input size, or the padded I/O bit-width exceeds 32 bits.
        """

        if isinstance(data, Sequence):
            data = np.concatenate([a.reshape(a.shape[0], -1) for a in data], axis=-1)

        assert self._lib is not None, 'Library not loaded, call .compile() first.'
        inp_size, out_size = self._solution.shape

        assert data.size % inp_size == 0, f'Input size {data.size} is not divisible by {inp_size}'
        n_sample = data.size // inp_size

        kifs_in, kifs_out = get_io_kifs(self._solution)
        # Convert to Python ints: the arrays are int8, and adding int8 scalars
        # could wrap around and let an oversized bit-width pass the checks below.
        k_in, i_in, f_in = (int(np.max(row)) for row in kifs_in)
        k_out, i_out, f_out = (int(np.max(row)) for row in kifs_out)
        assert k_in + i_in + f_in <= 32, "Padded inp bw doesn't fit in int32. Emulation not supported"
        assert k_out + i_out + f_out <= 32, "Padded out bw doesn't fit in int32. Emulation not supported"

        inp_data = np.empty(n_sample * inp_size, dtype=np.int32)
        out_data = np.empty(n_sample * out_size, dtype=np.int32)

        # Convert to int32 matching the LSB position
        inp_scale = np.float64(2.0) ** f_in
        inp_data[:] = np.floor(data.ravel() * inp_scale)

        inp_buf = inp_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
        out_buf = out_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))

        # Run from the memfiles directory (presumably so the simulated design
        # can locate its table memory files by relative path).
        with at_path(self._path / 'src/memfiles'):
            self._lib.inference(inp_buf, out_buf, n_sample, n_threads)

        # Unscale the output int32 to recover fp values:
        # wrap into the range of the padded output format, then apply the LSB weight.
        a = 2.0 ** (k_out + i_out + f_out)
        b = k_out * 2.0 ** (i_out + f_out)
        c = 2.0**-f_out
        return ((out_data.reshape(n_sample, out_size) + b) % a - b) * c

    def __repr__(self):
        """Return a multi-line summary: I/O sizes, timing, cost and emulator state."""
        inp_size, out_size = self._solution.shape
        cost = round(self._solution.cost)
        kifs_in, kifs_out = get_io_kifs(self._solution)
        in_bits, out_bits = np.sum(kifs_in), np.sum(kifs_out)
        if self._pipe is not None:
            n_stage = len(self._pipe[0])
            delay_suffix = '' if self._register_layers == 1 else f'x {self._register_layers} '
            lat_cutoff = self._latency_cutoff
            reg_bits = self._pipe.reg_bits
            spec = f"""Top Module: {self._prj_name}\n====================
{inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
{n_stage} {delay_suffix}stages @ max_delay={lat_cutoff}
Estimated cost: {cost} LUTs, {reg_bits} FFs"""

        else:
            spec = f"""Top Module: {self._prj_name}\n====================
{inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
combinational @ delay={self._solution.latency}
Estimated cost: {cost} LUTs"""

        is_compiled = self._lib is not None
        if is_compiled:
            assert self._uuid is not None
            openmp = 'with OpenMP' if self._lib.openmp_enabled() else ''  # type: ignore
            spec += f'\nEmulator is compiled {openmp} ({self._uuid[-12:]})'
        else:
            spec += '\nEmulator is **not compiled**'
        return spec
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
class VerilogModel(RTLModel):
    """``RTLModel`` with the HDL flavor fixed to ``'verilog'``."""

    def __init__(
        self,
        solution: CombLogic | Pipeline,
        prj_name: str,
        path: str | Path,
        latency_cutoff: float = -1,
        print_latency: bool = True,
        part_name: str = 'xcvu13p-flga2577-2-e',
        clock_period: float = 5,
        clock_uncertainty: float = 0.1,
        io_delay_minmax: tuple[float, float] = (0.2, 0.4),
        register_layers: int = 1,
    ):
        """Forward every argument to ``RTLModel.__init__`` with ``flavor='verilog'``."""
        super().__init__(
            solution=solution,
            prj_name=prj_name,
            path=path,
            flavor='verilog',
            latency_cutoff=latency_cutoff,
            print_latency=print_latency,
            part_name=part_name,
            clock_period=clock_period,
            clock_uncertainty=clock_uncertainty,
            io_delay_minmax=io_delay_minmax,
            register_layers=register_layers,
        )
        # Historically bound to the return value of __init__, which is always
        # None; the attribute is kept with that value for compatibility.
        self._hdl_model = None
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
class VHDLModel(RTLModel):
    """``RTLModel`` with the HDL flavor fixed to ``'vhdl'``."""

    def __init__(
        self,
        solution: CombLogic | Pipeline,
        prj_name: str,
        path: str | Path,
        latency_cutoff: float = -1,
        print_latency: bool = True,
        part_name: str = 'xcvu13p-flga2577-2-e',
        clock_period: float = 5,
        clock_uncertainty: float = 0.1,
        io_delay_minmax: tuple[float, float] = (0.2, 0.4),
        register_layers: int = 1,
    ):
        """Forward every argument to ``RTLModel.__init__`` with ``flavor='vhdl'``."""
        super().__init__(
            solution=solution,
            prj_name=prj_name,
            path=path,
            flavor='vhdl',
            latency_cutoff=latency_cutoff,
            print_latency=print_latency,
            part_name=part_name,
            clock_period=clock_period,
            clock_uncertainty=clock_uncertainty,
            io_delay_minmax=io_delay_minmax,
            register_layers=register_layers,
        )
        # Historically bound to the return value of __init__, which is always
        # None; the attribute is kept with that value for compatibility.
        self._hdl_model = None
|