da4ml 0.5.1.post1__cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. da4ml/__init__.py +4 -0
  2. da4ml/_binary/__init__.py +15 -0
  3. da4ml/_binary/dais_bin.cpython-311-x86_64-linux-gnu.so +0 -0
  4. da4ml/_binary/dais_bin.pyi +5 -0
  5. da4ml/_cli/__init__.py +30 -0
  6. da4ml/_cli/convert.py +204 -0
  7. da4ml/_cli/report.py +295 -0
  8. da4ml/_version.py +32 -0
  9. da4ml/cmvm/__init__.py +4 -0
  10. da4ml/cmvm/api.py +264 -0
  11. da4ml/cmvm/core/__init__.py +221 -0
  12. da4ml/cmvm/core/indexers.py +83 -0
  13. da4ml/cmvm/core/state_opr.py +284 -0
  14. da4ml/cmvm/types.py +739 -0
  15. da4ml/cmvm/util/__init__.py +7 -0
  16. da4ml/cmvm/util/bit_decompose.py +86 -0
  17. da4ml/cmvm/util/mat_decompose.py +121 -0
  18. da4ml/codegen/__init__.py +9 -0
  19. da4ml/codegen/hls/__init__.py +4 -0
  20. da4ml/codegen/hls/hls_codegen.py +196 -0
  21. da4ml/codegen/hls/hls_model.py +255 -0
  22. da4ml/codegen/hls/source/ap_types/ap_binary.h +78 -0
  23. da4ml/codegen/hls/source/ap_types/ap_common.h +376 -0
  24. da4ml/codegen/hls/source/ap_types/ap_decl.h +212 -0
  25. da4ml/codegen/hls/source/ap_types/ap_fixed.h +360 -0
  26. da4ml/codegen/hls/source/ap_types/ap_fixed_base.h +2354 -0
  27. da4ml/codegen/hls/source/ap_types/ap_fixed_ref.h +718 -0
  28. da4ml/codegen/hls/source/ap_types/ap_fixed_special.h +230 -0
  29. da4ml/codegen/hls/source/ap_types/ap_int.h +330 -0
  30. da4ml/codegen/hls/source/ap_types/ap_int_base.h +1885 -0
  31. da4ml/codegen/hls/source/ap_types/ap_int_ref.h +1346 -0
  32. da4ml/codegen/hls/source/ap_types/ap_int_special.h +223 -0
  33. da4ml/codegen/hls/source/ap_types/ap_shift_reg.h +138 -0
  34. da4ml/codegen/hls/source/ap_types/etc/ap_private.h +7199 -0
  35. da4ml/codegen/hls/source/ap_types/hls_math.h +27 -0
  36. da4ml/codegen/hls/source/ap_types/hls_stream.h +263 -0
  37. da4ml/codegen/hls/source/ap_types/utils/x_hls_utils.h +80 -0
  38. da4ml/codegen/hls/source/binder_util.hh +71 -0
  39. da4ml/codegen/hls/source/build_binder.mk +22 -0
  40. da4ml/codegen/hls/source/vitis_bitshift.hh +32 -0
  41. da4ml/codegen/rtl/__init__.py +15 -0
  42. da4ml/codegen/rtl/common_source/binder_util.hh +99 -0
  43. da4ml/codegen/rtl/common_source/build_binder.mk +34 -0
  44. da4ml/codegen/rtl/common_source/build_quartus_prj.tcl +104 -0
  45. da4ml/codegen/rtl/common_source/build_vivado_prj.tcl +111 -0
  46. da4ml/codegen/rtl/common_source/ioutil.hh +124 -0
  47. da4ml/codegen/rtl/common_source/template.sdc +27 -0
  48. da4ml/codegen/rtl/common_source/template.xdc +30 -0
  49. da4ml/codegen/rtl/rtl_model.py +486 -0
  50. da4ml/codegen/rtl/verilog/__init__.py +10 -0
  51. da4ml/codegen/rtl/verilog/comb.py +239 -0
  52. da4ml/codegen/rtl/verilog/io_wrapper.py +113 -0
  53. da4ml/codegen/rtl/verilog/pipeline.py +67 -0
  54. da4ml/codegen/rtl/verilog/source/lookup_table.v +27 -0
  55. da4ml/codegen/rtl/verilog/source/multiplier.v +37 -0
  56. da4ml/codegen/rtl/verilog/source/mux.v +58 -0
  57. da4ml/codegen/rtl/verilog/source/negative.v +31 -0
  58. da4ml/codegen/rtl/verilog/source/shift_adder.v +59 -0
  59. da4ml/codegen/rtl/vhdl/__init__.py +9 -0
  60. da4ml/codegen/rtl/vhdl/comb.py +206 -0
  61. da4ml/codegen/rtl/vhdl/io_wrapper.py +120 -0
  62. da4ml/codegen/rtl/vhdl/pipeline.py +71 -0
  63. da4ml/codegen/rtl/vhdl/source/lookup_table.vhd +52 -0
  64. da4ml/codegen/rtl/vhdl/source/multiplier.vhd +40 -0
  65. da4ml/codegen/rtl/vhdl/source/mux.vhd +102 -0
  66. da4ml/codegen/rtl/vhdl/source/negative.vhd +35 -0
  67. da4ml/codegen/rtl/vhdl/source/shift_adder.vhd +101 -0
  68. da4ml/converter/__init__.py +63 -0
  69. da4ml/converter/hgq2/__init__.py +3 -0
  70. da4ml/converter/hgq2/layers/__init__.py +11 -0
  71. da4ml/converter/hgq2/layers/_base.py +132 -0
  72. da4ml/converter/hgq2/layers/activation.py +81 -0
  73. da4ml/converter/hgq2/layers/attn.py +148 -0
  74. da4ml/converter/hgq2/layers/batchnorm.py +15 -0
  75. da4ml/converter/hgq2/layers/conv.py +149 -0
  76. da4ml/converter/hgq2/layers/dense.py +39 -0
  77. da4ml/converter/hgq2/layers/ops.py +246 -0
  78. da4ml/converter/hgq2/layers/pool.py +107 -0
  79. da4ml/converter/hgq2/layers/table.py +176 -0
  80. da4ml/converter/hgq2/parser.py +161 -0
  81. da4ml/trace/__init__.py +6 -0
  82. da4ml/trace/fixed_variable.py +965 -0
  83. da4ml/trace/fixed_variable_array.py +600 -0
  84. da4ml/trace/ops/__init__.py +13 -0
  85. da4ml/trace/ops/einsum_utils.py +305 -0
  86. da4ml/trace/ops/quantization.py +74 -0
  87. da4ml/trace/ops/reduce_utils.py +105 -0
  88. da4ml/trace/pipeline.py +181 -0
  89. da4ml/trace/tracer.py +186 -0
  90. da4ml/typing/__init__.py +3 -0
  91. da4ml-0.5.1.post1.dist-info/METADATA +85 -0
  92. da4ml-0.5.1.post1.dist-info/RECORD +96 -0
  93. da4ml-0.5.1.post1.dist-info/WHEEL +6 -0
  94. da4ml-0.5.1.post1.dist-info/entry_points.txt +3 -0
  95. da4ml-0.5.1.post1.dist-info/sboms/auditwheel.cdx.json +1 -0
  96. da4ml.libs/libgomp-e985bcbb.so.1.0.0 +0 -0
@@ -0,0 +1,486 @@
1
+ import ctypes
2
+ import json
3
+ import os
4
+ import re
5
+ import shutil
6
+ import subprocess
7
+ import sys
8
+ from collections.abc import Sequence
9
+ from pathlib import Path
10
+ from typing import Any
11
+ from uuid import uuid4
12
+
13
+ import numpy as np
14
+ from numpy.typing import NDArray
15
+
16
+ from ...cmvm.types import CombLogic, Pipeline, _minimal_kif
17
+ from ...trace.pipeline import to_pipeline
18
+ from .. import rtl
19
+
20
+
21
def get_io_kifs(sol: CombLogic | Pipeline):
    """Gather the minimal ``(k, i, f)`` triples of a solution's inputs and outputs.

    ``_minimal_kif`` is applied to every entry of ``sol.inp_qint`` and
    ``sol.out_qint``; the triples are then transposed so that each returned
    array holds one row per component (all ``k``, all ``i``, all ``f``).

    Parameters
    ----------
    sol : CombLogic | Pipeline
        Object exposing ``inp_qint`` and ``out_qint``.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        ``(inp_kifs, out_kifs)``, both of dtype ``int8``.
    """

    def _transposed(qints):
        # zip(*...) turns a list of (k, i, f) triples into three per-component tuples
        per_port = [_minimal_kif(q) for q in qints]
        return tuple(zip(*per_port))

    inp_kifs = _transposed(sol.inp_qint)
    out_kifs = _transposed(sol.out_qint)
    return np.array(inp_kifs, np.int8), np.array(out_kifs, np.int8)
25
+
26
+
27
def binder_gen(csol: Pipeline | CombLogic, module_name: str, II: int = 1, latency_multiplier: int = 1):
    """Render the C++ binder source that exposes the simulated module through a C ABI.

    Parameters
    ----------
    csol : Pipeline | CombLogic
        Solution whose I/O description (``inp_qint``, ``out_qint``, ``shape``) is
        baked into the generated config struct.
    module_name : str
        Name of the wrapped module; the generated code includes ``V{module_name}.h``.
    II : int, optional
        Value written to the ``II`` field, by default 1. Forced to 0 for a ``CombLogic``.
    latency_multiplier : int, optional
        Factor applied to the number of pipeline stages to obtain ``latency``,
        by default 1. Unused for a ``CombLogic`` (latency is 0).

    Returns
    -------
    str
        C++ source defining ``{module_name}_config`` plus the ``extern "C"``
        functions ``openmp_enabled`` and ``inference``.
    """
    k_in, i_in, f_in = zip(*[_minimal_kif(q) for q in csol.inp_qint])
    k_out, i_out, f_out = zip(*[_minimal_kif(q) for q in csol.out_qint])

    # Width of the uniform port: each component's maximum over all ports, summed
    max_inp_bw = max(k_in) + max(i_in) + max(f_in)
    max_out_bw = max(k_out) + max(i_out) + max(f_out)

    if not isinstance(csol, CombLogic):
        latency = len(csol.solutions) * latency_multiplier
    else:
        # Combinational logic: both fields are written as zero
        II, latency = 0, 0

    n_in, n_out = csol.shape
    return f"""#include <cstddef>
#include "binder_util.hh"
#include "V{module_name}.h"

struct {module_name}_config {{
static const size_t N_inp = {n_in};
static const size_t N_out = {n_out};
static const size_t max_inp_bw = {max_inp_bw};
static const size_t max_out_bw = {max_out_bw};
static const size_t II = {II};
static const size_t latency = {latency};
typedef V{module_name} dut_t;
}};

extern "C" {{
bool openmp_enabled() {{
return _openmp;
}}

void inference(int32_t *c_inp, int32_t *c_out, size_t n_samples, size_t n_threads) {{
batch_inference<{module_name}_config>(c_inp, c_out, n_samples, n_threads);
}}
}}
"""
62
+
63
+
64
+ class at_path:
65
+ def __init__(self, path: str | Path):
66
+ self._path = Path(path)
67
+ self._orig_cwd = None
68
+
69
+ def __enter__(self):
70
+ self._orig_cwd = Path.cwd()
71
+ os.chdir(self._path)
72
+
73
+ def __exit__(self, exc_type, exc_value, traceback):
74
+ os.chdir(self._orig_cwd) # type: ignore
75
+
76
+
77
class RTLModel:
    """Write an RTL project for a solution and emulate it through a compiled binder.

    The model writes HDL sources, build scripts, timing constraints and a C++
    binder under ``path``, builds the binder with ``make -f build_binder.mk``
    and calls the resulting shared library through ``ctypes``.

    Parameters
    ----------
    solution : CombLogic | Pipeline
        Logic to export. A ``CombLogic`` is converted with ``to_pipeline`` when
        ``latency_cutoff`` is positive; otherwise it is emitted as combinational logic.
    prj_name : str
        Name of the top module; also used in the generated file names.
    path : str | Path
        Project directory; resolved to an absolute path.
    flavor : str, optional
        ``'verilog'`` or ``'vhdl'`` (case-insensitive), by default ``'verilog'``.
    latency_cutoff : float, optional
        Passed to ``to_pipeline``; a non-positive value leaves a ``CombLogic``
        unpipelined. By default -1.
    print_latency : bool, optional
        Forwarded to the code generators, by default True.
    part_name : str, optional
        Substituted for ``$::env(DEVICE)`` in the build scripts.
    clock_period : float, optional
        Substituted for ``$::env(CLOCK_PERIOD)`` in the constraint templates.
    clock_uncertainty : float, optional
        Substituted for both the setup and hold uncertainty placeholders.
    io_delay_minmax : tuple[float, float], optional
        ``(min, max)`` substituted for ``$::env(DELAY_MIN)`` / ``$::env(DELAY_MAX)``.
    register_layers : int, optional
        Forwarded to the pipeline generator and used as the binder's latency multiplier.

    Raises
    ------
    AssertionError
        If ``flavor`` is neither ``'vhdl'`` nor ``'verilog'``.
    """

    def __init__(
        self,
        solution: CombLogic | Pipeline,
        prj_name: str,
        path: str | Path,
        flavor: str = 'verilog',
        latency_cutoff: float = -1,
        print_latency: bool = True,
        part_name: str = 'xcvu13p-flga2577-2-e',
        clock_period: float = 5,
        clock_uncertainty: float = 0.1,
        io_delay_minmax: tuple[float, float] = (0.2, 0.4),
        register_layers: int = 1,
    ):
        self._flavor = flavor.lower()
        self._solution = solution
        self._path = Path(path).resolve()
        self._prj_name = prj_name
        self._latency_cutoff = latency_cutoff
        self._print_latency = print_latency
        # Directory of the `rtl` package; templates and static sources are read from it
        self.__src_root = Path(rtl.__file__).parent
        self._part_name = part_name
        self._clock_period = clock_period
        self._clock_uncertainty = clock_uncertainty
        self._io_delay_minmax = io_delay_minmax
        self._register_layers = register_layers
        # When True, `write` skips generating the main logic and memory files
        self._place_holder = False

        assert self._flavor in ('vhdl', 'verilog'), f'Unsupported flavor {flavor}, only vhdl and verilog are supported.'

        self._pipe = solution if isinstance(solution, Pipeline) else None
        if latency_cutoff > 0 and self._pipe is None:
            assert isinstance(solution, CombLogic)
            self._pipe = to_pipeline(solution, latency_cutoff, verbose=False)

        if self._pipe is not None:
            # get actual latency cutoff
            latency_cutoff = int(max(max(st.latency) / (i + 1) for i, st in enumerate(self._pipe.solutions)))
            self._latency_cutoff = latency_cutoff

        # Handle of the compiled emulator and the stamp identifying its shared object
        self._lib = None
        self._uuid = None

    def write(self, metadata: None | dict[str, Any] = None):
        """Write the RTL project to the specified path.

        Parameters
        ----------
        metadata : dict[str, Any] | None, optional
            Additional metadata to write to `metadata.json`, by default None.
            The mapping is not modified; keys generated here (``cost``,
            ``flavor`` and, for pipelines, ``latency`` and ``reg_bits``)
            override same-named user keys.
        """

        flavor = self._flavor
        suffix = 'v' if flavor == 'verilog' else 'vhd'
        if flavor == 'vhdl':
            from .vhdl import comb_logic_gen, generate_io_wrapper, pipeline_logic_gen
        else:  # verilog
            from .verilog import comb_logic_gen, generate_io_wrapper, pipeline_logic_gen

        # The table memory generator is taken from the Verilog package for both flavors
        from .verilog.comb import table_mem_gen

        (self._path / 'src/static').mkdir(parents=True, exist_ok=True)
        (self._path / 'sim').mkdir(exist_ok=True)
        (self._path / 'model').mkdir(exist_ok=True)
        (self._path / 'src/memfiles').mkdir(exist_ok=True)

        # Build scripts
        for path in (self.__src_root).glob('common_source/build_*_prj.tcl'):
            with open(path) as f:
                tcl = f.read()
            tcl = tcl.replace('$::env(DEVICE)', self._part_name)
            tcl = tcl.replace('$::env(PROJECT_NAME)', self._prj_name)
            tcl = tcl.replace('$::env(SOURCE_TYPE)', flavor)
            with open(self._path / path.name, 'w') as f:
                f.write(tcl)

        if self._pipe is not None:  # Pipeline
            if not self._place_holder:
                # Main logic
                codes = pipeline_logic_gen(self._pipe, self._prj_name, self._print_latency, register_layers=self._register_layers)

                # Table memory files
                memfiles: dict[str, str] = {}
                for comb in self._pipe.solutions:
                    memfiles.update(table_mem_gen(comb))

                for k, v in codes.items():
                    with open(self._path / f'src/{k}.{suffix}', 'w') as f:
                        f.write(v)
            else:
                memfiles = {}

            # Timing constraint
            for fmt in ('xdc', 'sdc'):
                with open(self.__src_root / f'common_source/template.{fmt}') as f:
                    constraint = f.read()
                constraint = constraint.replace('$::env(CLOCK_PERIOD)', str(self._clock_period))
                constraint = constraint.replace('$::env(UNCERTAINITY_SETUP)', str(self._clock_uncertainty))
                constraint = constraint.replace('$::env(UNCERTAINITY_HOLD)', str(self._clock_uncertainty))
                constraint = constraint.replace('$::env(DELAY_MAX)', str(self._io_delay_minmax[1]))
                constraint = constraint.replace('$::env(DELAY_MIN)', str(self._io_delay_minmax[0]))
                with open(self._path / f'src/{self._prj_name}.{fmt}', 'w') as f:
                    f.write(constraint)

            # C++ binder w/ HDL wrapper for uniform bw
            binder = binder_gen(self._pipe, f'{self._prj_name}_wrapper', 1, self._register_layers)

            # Verilog IO wrapper (non-uniform bw to uniform one, clk passthrough)
            io_wrapper = generate_io_wrapper(self._pipe, self._prj_name, True)

            self._pipe.save(self._path / 'model/pipeline.json')
        else:  # Comb
            assert isinstance(self._solution, CombLogic)

            if not self._place_holder:
                # Table memory files
                memfiles = table_mem_gen(self._solution)

                # Main logic
                code = comb_logic_gen(self._solution, self._prj_name, self._print_latency, '`timescale 1ns/1ps')
                with open(self._path / f'src/{self._prj_name}.{suffix}', 'w') as f:
                    f.write(code)
            else:
                memfiles = {}

            # Verilog IO wrapper (non-uniform bw to uniform one, no clk)
            io_wrapper = generate_io_wrapper(self._solution, self._prj_name, False)
            binder = binder_gen(self._solution, f'{self._prj_name}_wrapper')

        # Write table memory files
        for name, mem in memfiles.items():
            with open(self._path / 'src/memfiles' / name, 'w') as f:
                f.write(mem)

        with open(self._path / f'src/{self._prj_name}_wrapper.{suffix}', 'w') as f:
            f.write(io_wrapper)
        with open(self._path / f'sim/{self._prj_name}_wrapper_binder.cc', 'w') as f:
            f.write(binder)

        # Common resource copy
        for path in self.__src_root.glob(f'{flavor}/source/*.{suffix}'):
            shutil.copy(path, self._path / 'src/static')

        shutil.copy(self.__src_root / 'common_source/build_binder.mk', self._path / 'sim')
        shutil.copy(self.__src_root / 'common_source/ioutil.hh', self._path / 'sim')
        shutil.copy(self.__src_root / 'common_source/binder_util.hh', self._path / 'sim')
        self._solution.save(self._path / 'model/comb.json')

        _metadata: dict[str, Any] = {'cost': self._solution.cost, 'flavor': self._flavor}
        if self._pipe is not None:
            _metadata['latency'] = len(self._pipe[0])
            _metadata['reg_bits'] = self._pipe.reg_bits

        if metadata is not None:
            # Merge into a fresh dict so the caller's mapping is left untouched;
            # user keys come first and generated keys win on conflict.
            _metadata = {**metadata, **_metadata}

        with open(self._path / 'metadata.json', 'w') as f:
            f.write(json.dumps(_metadata))

    def _compile(self, verbose=False, openmp=True, nproc=None, o3: bool = False, clean=True, _env: dict[str, str] | None = None):
        """Build the emulator from an already written project and load it.

        Same as `compile`, except that the project files are not (re)written.

        Parameters
        ----------
        verbose : bool, optional
            Verbose output, by default False. When True the build output is
            not captured and goes straight to the console.
        openmp : bool, optional
            Enable openmp, by default True
        nproc : int | None, optional
            Number of processes to use for compilation, by default None.
            When None, ``N_JOBS`` is not set and the makefile default applies
            (originally documented as the number of CPU cores, at most 32).
        o3 : bool | None, optional
            Turn on -O3 flag, by default False (selects the ``fast`` make target)
        clean : bool, optional
            Remove obsolete shared object files and `obj_dir`, by default True
        _env : dict[str, str] | None, optional
            Extra environment variables for the build; they override the
            variables set by this method, except ``N_JOBS`` when ``nproc`` is given.

        Raises
        ------
        RuntimeError
            If compilation fails
        """

        self._uuid = str(uuid4())
        sim_dir = self._path / 'sim'
        args = ['make', '-f', 'build_binder.mk']
        env = os.environ.copy()
        env['VM_PREFIX'] = f'{self._prj_name}_wrapper'
        env['STAMP'] = self._uuid
        env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
        env['VERILATOR_FLAGS'] = '-Wall' if self._flavor == 'verilog' else ''
        if _env is not None:
            env.update(_env)
        if nproc is not None:
            env['N_JOBS'] = str(nproc)
        if o3:
            args.append('fast')

        if clean:
            # Shared objects left by earlier builds carry a uuid4 stamp in their name
            m = re.compile(r'^lib.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.so$')
            for p in sim_dir.iterdir():
                if not p.is_dir() and m.match(p.name):
                    p.unlink()
            subprocess.run(
                ['make', '-f', 'build_binder.mk', 'clean'],
                env=env,
                cwd=sim_dir,
                check=True,
                capture_output=not verbose,
            )

        try:
            subprocess.run(args, env=env, check=True, cwd=sim_dir, capture_output=not verbose)
        except subprocess.CalledProcessError as e:
            # stdout/stderr are None when the output was not captured (verbose=True);
            # it has already been shown on the console in that case.
            if e.stderr is not None:
                print(e.stderr.decode(errors='replace'), file=sys.stderr)
            if e.stdout is not None:
                print(e.stdout.decode(errors='replace'), file=sys.stdout)
            raise RuntimeError('Compilation failed!!') from e

        if clean:
            subprocess.run(['rm', '-rf', 'obj_dir'], cwd=sim_dir, check=True, capture_output=not verbose)

        self._load_lib(self._uuid)

    def _load_lib(self, uuid: str | None = None):
        """Load the compiled emulator shared object with ``ctypes``.

        Parameters
        ----------
        uuid : str | None, optional
            Stamp of the library to load. Falls back to the stamp of the last
            build of this instance, then to a library found under ``sim/``.

        Raises
        ------
        RuntimeError
            If no library is found, or the selected library file does not exist.
        """
        uuid = uuid if uuid is not None else self._uuid
        if uuid is None:
            # No stamp known: pick up an existing build. If several libraries
            # match, the first one returned by glob is used.
            libs = list(self._path.glob(f'sim/lib{self._prj_name}_wrapper_*.so'))
            if len(libs) == 0:
                raise RuntimeError(f'Cannot load library, found {len(libs)} libraries in {self._path}')
            # The stamp is the text between the last underscore and the extension
            uuid = libs[0].name.split('_')[-1].split('.', 1)[0]
        self._uuid = uuid
        lib_path = self._path / f'sim/lib{self._prj_name}_wrapper_{uuid}.so'
        if not lib_path.exists():
            raise RuntimeError(f'Library {lib_path} does not exist')
        self._lib = ctypes.CDLL(str(lib_path))

    def compile(
        self,
        verbose=False,
        openmp=True,
        nproc: int | None = None,
        o3: bool = False,
        clean=True,
        metadata: None | dict[str, Any] = None,
    ):
        """Compile the generated code to a emulator for logic simulation.

        Writes the project with `write` and then builds and loads the emulator.

        Parameters
        ----------
        verbose : bool, optional
            Verbose output, by default False
        openmp : bool, optional
            Enable openmp, by default True
        nproc : int | None, optional
            Number of processes to use for compilation, by default None.
            When None, ``N_JOBS`` is not set and the makefile default applies
            (originally documented as the number of CPU cores, at most 32).
        o3 : bool | None, optional
            Turn on -O3 flag, by default False
        clean : bool, optional
            Remove obsolete shared object files and `obj_dir`, by default True
        metadata : dict[str, Any] | None, optional
            Additional metadata to write to `metadata.json`, by default None

        Raises
        ------
        RuntimeError
            If compilation fails
        """

        self.write(metadata=metadata)
        self._compile(verbose=verbose, openmp=openmp, nproc=nproc, o3=o3, clean=clean)

    def predict(self, data: NDArray | Sequence[NDArray], n_threads: int = 0) -> NDArray[np.float32]:
        """Run the model on the input data.

        Parameters
        ----------
        data : NDArray[np.floating]|Sequence[NDArray[np.floating]]
            Input data to the model. The shape is ignored, and the number of samples is
            determined by the size of the data. A sequence of arrays is flattened per
            sample and concatenated along the last axis first.
        n_threads : int, optional
            Forwarded unchanged to the compiled ``inference`` function, by default 0.

        Returns
        -------
        NDArray
            Output of the model in shape (n_samples, output_size).
            NOTE(review): the float32 cast below applies only to the scale factor,
            so the returned array is expected to be float64 although the
            signature advertises float32 - confirm which one is intended.

        Raises
        ------
        AssertionError
            If the emulator is not loaded, the input size is not a multiple of the
            model input size, or the padded I/O bit width exceeds 32.
        """

        if isinstance(data, Sequence):
            data = np.concatenate([a.reshape(a.shape[0], -1) for a in data], axis=-1)

        assert self._lib is not None, 'Library not loaded, call .compile() first.'
        inp_size, out_size = self._solution.shape

        assert data.size % inp_size == 0, f'Input size {data.size} is not divisible by {inp_size}'
        n_sample = data.size // inp_size

        kifs_in, kifs_out = get_io_kifs(self._solution)
        k_in, i_in, f_in = map(np.max, kifs_in)
        k_out, i_out, f_out = map(np.max, kifs_out)
        assert k_in + i_in + f_in <= 32, "Padded inp bw doesn't fit in int32. Emulation not supported"
        assert k_out + i_out + f_out <= 32, "Padded out bw doesn't fit in int32. Emulation not supported"

        inp_data = np.empty(n_sample * inp_size, dtype=np.int32)
        out_data = np.empty(n_sample * out_size, dtype=np.int32)

        # Convert to int32 matching the LSB position
        inp_data[:] = np.floor(data.ravel() * 2.0**f_in)

        inp_buf = inp_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
        out_buf = out_data.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))

        # The call is made with src/memfiles as the working directory
        with at_path(self._path / 'src/memfiles'):
            self._lib.inference(inp_buf, out_buf, n_sample, n_threads)

        # Unscale the output int32 to recover fp values
        k, i, f = np.max(k_out), np.max(i_out), np.max(f_out)
        # a: modulus 2**(k+i+f); b: offset used to wrap into the signed range; c: LSB weight
        a, b, c = 2.0 ** (k + i + f), k * 2.0 ** (i + f), 2.0**-f
        return ((out_data.reshape(n_sample, out_size) + b) % a - b) * c.astype(np.float32)

    def __repr__(self):
        """Return a multi-line summary: I/O sizes, stages or delay, cost and emulator state."""
        inp_size, out_size = self._solution.shape
        cost = round(self._solution.cost)
        kifs_in, kifs_out = get_io_kifs(self._solution)
        in_bits, out_bits = np.sum(kifs_in), np.sum(kifs_out)
        if self._pipe is not None:
            n_stage = len(self._pipe[0])
            delay_suffix = '' if self._register_layers == 1 else f'x {self._register_layers} '
            lat_cutoff = self._latency_cutoff
            reg_bits = self._pipe.reg_bits
            spec = f"""Top Module: {self._prj_name}\n====================
{inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
{n_stage} {delay_suffix}stages @ max_delay={lat_cutoff}
Estimated cost: {cost} LUTs, {reg_bits} FFs"""

        else:
            spec = f"""Top Module: {self._prj_name}\n====================
{inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
combinational @ delay={self._solution.latency}
Estimated cost: {cost} LUTs"""

        is_compiled = self._lib is not None
        if is_compiled:
            assert self._uuid is not None
            openmp = 'with OpenMP' if self._lib.openmp_enabled() else ''  # type: ignore
            # Only the last 12 characters of the stamp are shown
            spec += f'\nEmulator is compiled {openmp} ({self._uuid[-12:]})'
        else:
            spec += '\nEmulator is **not compiled**'
        return spec
429
+
430
+
431
class VerilogModel(RTLModel):
    """`RTLModel` with the flavor fixed to ``'verilog'``.

    Takes the same arguments as `RTLModel`, minus ``flavor``.
    """

    def __init__(
        self,
        solution: CombLogic | Pipeline,
        prj_name: str,
        path: str | Path,
        latency_cutoff: float = -1,
        print_latency: bool = True,
        part_name: str = 'xcvu13p-flga2577-2-e',
        clock_period: float = 5,
        clock_uncertainty: float = 0.1,
        io_delay_minmax: tuple[float, float] = (0.2, 0.4),
        register_layers: int = 1,
    ):
        super().__init__(
            solution=solution,
            prj_name=prj_name,
            path=path,
            flavor='verilog',
            latency_cutoff=latency_cutoff,
            print_latency=print_latency,
            part_name=part_name,
            clock_period=clock_period,
            clock_uncertainty=clock_uncertainty,
            io_delay_minmax=io_delay_minmax,
            register_layers=register_layers,
        )
        # This attribute used to hold the return value of `__init__`, i.e. None;
        # it is kept so the set of instance attributes stays the same.
        self._hdl_model = None
458
+
459
+
460
class VHDLModel(RTLModel):
    """`RTLModel` with the flavor fixed to ``'vhdl'``.

    Takes the same arguments as `RTLModel`, minus ``flavor``.
    """

    def __init__(
        self,
        solution: CombLogic | Pipeline,
        prj_name: str,
        path: str | Path,
        latency_cutoff: float = -1,
        print_latency: bool = True,
        part_name: str = 'xcvu13p-flga2577-2-e',
        clock_period: float = 5,
        clock_uncertainty: float = 0.1,
        io_delay_minmax: tuple[float, float] = (0.2, 0.4),
        register_layers: int = 1,
    ):
        super().__init__(
            solution=solution,
            prj_name=prj_name,
            path=path,
            flavor='vhdl',
            latency_cutoff=latency_cutoff,
            print_latency=print_latency,
            part_name=part_name,
            clock_period=clock_period,
            clock_uncertainty=clock_uncertainty,
            io_delay_minmax=io_delay_minmax,
            register_layers=register_layers,
        )
        # This attribute used to hold the return value of `__init__`, i.e. None;
        # it is kept so the set of instance attributes stays the same.
        self._hdl_model = None
@@ -0,0 +1,10 @@
1
+ from .comb import comb_logic_gen, table_mem_gen
2
+ from .io_wrapper import generate_io_wrapper
3
+ from .pipeline import pipeline_logic_gen
4
+
5
# Public names of this package, re-exported from the sibling
# `comb`, `io_wrapper` and `pipeline` modules imported above.
__all__ = [
    'comb_logic_gen',
    'table_mem_gen',
    'generate_io_wrapper',
    'pipeline_logic_gen',
]
+ ]