da4ml-0.5.1.post1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- da4ml/__init__.py +4 -0
- da4ml/_binary/__init__.py +15 -0
- da4ml/_binary/dais_bin.cpython-311-x86_64-linux-gnu.so +0 -0
- da4ml/_binary/dais_bin.pyi +5 -0
- da4ml/_cli/__init__.py +30 -0
- da4ml/_cli/convert.py +204 -0
- da4ml/_cli/report.py +295 -0
- da4ml/_version.py +32 -0
- da4ml/cmvm/__init__.py +4 -0
- da4ml/cmvm/api.py +264 -0
- da4ml/cmvm/core/__init__.py +221 -0
- da4ml/cmvm/core/indexers.py +83 -0
- da4ml/cmvm/core/state_opr.py +284 -0
- da4ml/cmvm/types.py +739 -0
- da4ml/cmvm/util/__init__.py +7 -0
- da4ml/cmvm/util/bit_decompose.py +86 -0
- da4ml/cmvm/util/mat_decompose.py +121 -0
- da4ml/codegen/__init__.py +9 -0
- da4ml/codegen/hls/__init__.py +4 -0
- da4ml/codegen/hls/hls_codegen.py +196 -0
- da4ml/codegen/hls/hls_model.py +255 -0
- da4ml/codegen/hls/source/ap_types/ap_binary.h +78 -0
- da4ml/codegen/hls/source/ap_types/ap_common.h +376 -0
- da4ml/codegen/hls/source/ap_types/ap_decl.h +212 -0
- da4ml/codegen/hls/source/ap_types/ap_fixed.h +360 -0
- da4ml/codegen/hls/source/ap_types/ap_fixed_base.h +2354 -0
- da4ml/codegen/hls/source/ap_types/ap_fixed_ref.h +718 -0
- da4ml/codegen/hls/source/ap_types/ap_fixed_special.h +230 -0
- da4ml/codegen/hls/source/ap_types/ap_int.h +330 -0
- da4ml/codegen/hls/source/ap_types/ap_int_base.h +1885 -0
- da4ml/codegen/hls/source/ap_types/ap_int_ref.h +1346 -0
- da4ml/codegen/hls/source/ap_types/ap_int_special.h +223 -0
- da4ml/codegen/hls/source/ap_types/ap_shift_reg.h +138 -0
- da4ml/codegen/hls/source/ap_types/etc/ap_private.h +7199 -0
- da4ml/codegen/hls/source/ap_types/hls_math.h +27 -0
- da4ml/codegen/hls/source/ap_types/hls_stream.h +263 -0
- da4ml/codegen/hls/source/ap_types/utils/x_hls_utils.h +80 -0
- da4ml/codegen/hls/source/binder_util.hh +71 -0
- da4ml/codegen/hls/source/build_binder.mk +22 -0
- da4ml/codegen/hls/source/vitis_bitshift.hh +32 -0
- da4ml/codegen/rtl/__init__.py +15 -0
- da4ml/codegen/rtl/common_source/binder_util.hh +99 -0
- da4ml/codegen/rtl/common_source/build_binder.mk +34 -0
- da4ml/codegen/rtl/common_source/build_quartus_prj.tcl +104 -0
- da4ml/codegen/rtl/common_source/build_vivado_prj.tcl +111 -0
- da4ml/codegen/rtl/common_source/ioutil.hh +124 -0
- da4ml/codegen/rtl/common_source/template.sdc +27 -0
- da4ml/codegen/rtl/common_source/template.xdc +30 -0
- da4ml/codegen/rtl/rtl_model.py +486 -0
- da4ml/codegen/rtl/verilog/__init__.py +10 -0
- da4ml/codegen/rtl/verilog/comb.py +239 -0
- da4ml/codegen/rtl/verilog/io_wrapper.py +113 -0
- da4ml/codegen/rtl/verilog/pipeline.py +67 -0
- da4ml/codegen/rtl/verilog/source/lookup_table.v +27 -0
- da4ml/codegen/rtl/verilog/source/multiplier.v +37 -0
- da4ml/codegen/rtl/verilog/source/mux.v +58 -0
- da4ml/codegen/rtl/verilog/source/negative.v +31 -0
- da4ml/codegen/rtl/verilog/source/shift_adder.v +59 -0
- da4ml/codegen/rtl/vhdl/__init__.py +9 -0
- da4ml/codegen/rtl/vhdl/comb.py +206 -0
- da4ml/codegen/rtl/vhdl/io_wrapper.py +120 -0
- da4ml/codegen/rtl/vhdl/pipeline.py +71 -0
- da4ml/codegen/rtl/vhdl/source/lookup_table.vhd +52 -0
- da4ml/codegen/rtl/vhdl/source/multiplier.vhd +40 -0
- da4ml/codegen/rtl/vhdl/source/mux.vhd +102 -0
- da4ml/codegen/rtl/vhdl/source/negative.vhd +35 -0
- da4ml/codegen/rtl/vhdl/source/shift_adder.vhd +101 -0
- da4ml/converter/__init__.py +63 -0
- da4ml/converter/hgq2/__init__.py +3 -0
- da4ml/converter/hgq2/layers/__init__.py +11 -0
- da4ml/converter/hgq2/layers/_base.py +132 -0
- da4ml/converter/hgq2/layers/activation.py +81 -0
- da4ml/converter/hgq2/layers/attn.py +148 -0
- da4ml/converter/hgq2/layers/batchnorm.py +15 -0
- da4ml/converter/hgq2/layers/conv.py +149 -0
- da4ml/converter/hgq2/layers/dense.py +39 -0
- da4ml/converter/hgq2/layers/ops.py +246 -0
- da4ml/converter/hgq2/layers/pool.py +107 -0
- da4ml/converter/hgq2/layers/table.py +176 -0
- da4ml/converter/hgq2/parser.py +161 -0
- da4ml/trace/__init__.py +6 -0
- da4ml/trace/fixed_variable.py +965 -0
- da4ml/trace/fixed_variable_array.py +600 -0
- da4ml/trace/ops/__init__.py +13 -0
- da4ml/trace/ops/einsum_utils.py +305 -0
- da4ml/trace/ops/quantization.py +74 -0
- da4ml/trace/ops/reduce_utils.py +105 -0
- da4ml/trace/pipeline.py +181 -0
- da4ml/trace/tracer.py +186 -0
- da4ml/typing/__init__.py +3 -0
- da4ml-0.5.1.post1.dist-info/METADATA +85 -0
- da4ml-0.5.1.post1.dist-info/RECORD +96 -0
- da4ml-0.5.1.post1.dist-info/WHEEL +6 -0
- da4ml-0.5.1.post1.dist-info/entry_points.txt +3 -0
- da4ml-0.5.1.post1.dist-info/sboms/auditwheel.cdx.json +1 -0
- da4ml.libs/libgomp-e985bcbb.so.1.0.0 +0 -0
da4ml/cmvm/util/bit_decompose.py
@@ -0,0 +1,86 @@
import numpy as np
from numba import jit
from numpy.typing import NDArray


@jit
def _volatile_int_arr_to_csd(x: NDArray) -> NDArray[np.int8]:
    x = x
    N = np.max(np.ceil(np.log2(np.abs(x) * 1.5 + 1e-19)))
    N = int(max(N, 1))
    buf = np.zeros((*np.shape(x), N), dtype=np.int8)

    for n in range(N - 1, -1, -1):
        _2pn = 2**n
        thres = _2pn / 1.5
        bit = (x > thres).astype(np.int8)
        bit -= (x < -thres).astype(np.int8)
        x -= _2pn * bit.astype(x.dtype)
        buf[..., n] = bit
    return buf


@jit(error_model='numpy')
def _shift_centering(arr: NDArray):
    low, high = -64, 64
    if np.all(arr == 0):
        high = low = 0
    while high - low > 1:
        mid = (high + low) // 2
        xs = arr * (2.0**mid)
        if np.all(xs == np.floor(xs)):
            high = mid
        else:
            low = mid
    return -high


@jit(error_model='numpy')
def shift_centering(arr: NDArray, axis: int):
    n = arr.shape[axis]
    shifts = np.empty(n, dtype=np.int8)
    for i in range(n):
        shifts[i] = _shift_centering(arr.take(i, axis=axis))
    return shifts


@jit
def _center(arr: NDArray):
    shift1 = shift_centering(arr, 1)  # d_out
    arr = arr * (2.0**-shift1)
    shift0 = shift_centering(arr, 0)  # d_in
    arr = arr * (2.0 ** -shift0[:, None])
    return arr, shift0.astype(np.int8), shift1.astype(np.int8)


@jit(cache=True)
def csd_decompose(arr: NDArray, center=True):
    """
    Convert a 2D array to CSD representation.

    Parameters
    ----------
    arr : ndarray
        Input array to be converted.
    center : bool, optional
        If True, the array is centered before conversion. Default is True.
        If False, the function may accept non-2D arrays.

    Returns
    -------
    csd : ndarray
        CSD representation of the input array after centering, if center is True.
    shift0 : ndarray
        Shift values for the first axis.
    shift1 : ndarray
        Shift values for the second axis.
    """

    if center:
        arr, shift0, shift1 = _center(arr)
    else:
        shift0 = np.zeros(arr.shape[0], dtype=np.int8)
        shift1 = np.zeros(arr.shape[1], dtype=np.int8)
        arr = arr.copy()
    csd = _volatile_int_arr_to_csd(arr)
    return csd, shift0, shift1
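For orientation, a minimal round-trip sketch for the CSD decomposition above (illustrative only, not shipped in the wheel; the example matrix is arbitrary). Centering rescales each column and row by a power of two so every entry becomes an integer, and the returned shift arrays undo that scaling:

import numpy as np

from da4ml.cmvm.util.bit_decompose import csd_decompose

w = np.array([[1.5, -3.0, 0.75], [2.0, 0.0, -5.25]])
csd, shift0, shift1 = csd_decompose(w, center=True)

# csd[i, j, n] is the signed digit (-1, 0 or 1) of weight 2**n for the centered
# entry (i, j); shift0/shift1 restore the per-row / per-column scaling.
powers = 2.0 ** np.arange(csd.shape[-1])
centered = (csd * powers).sum(axis=-1)
rebuilt = centered * 2.0 ** shift0[:, None] * 2.0 ** shift1

assert csd.min() >= -1 and csd.max() <= 1
np.testing.assert_allclose(rebuilt, w)

Because the centering step makes every entry an integer, the digit loop in _volatile_int_arr_to_csd only needs non-negative powers of two.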
da4ml/cmvm/util/mat_decompose.py
@@ -0,0 +1,121 @@
from math import ceil, log2

import numpy as np
from numba import jit

from .bit_decompose import _center, _volatile_int_arr_to_csd


@jit
def prim_mst_dc(cost_mat: np.ndarray, dc: int = -1):
    """Minimum Spanning Tree (MST) using Prim's algorithm with a delay constraint. May not be optimal.
    Always starts from the root node (0).

    Parameters
    ----------
    cost_mat : np.ndarray
        The adjacency matrix of the graph, where cost_mat[i, j] is the cost of the edge between i and j.

    dc : int, optional
        The delay constraint, by default -1.
        If -1, no delay constraint is applied.

        The delay of each edge is ceiling(log2(cost_mat[i, j])).

        The delay from the root node to any node is the **maximum** delay of the edges connecting them,
        plus ceiling(log2(number of connecting edges)).
        It is **NOT** the sum of the edge delays.

    Returns
    -------
    np.ndarray
        The adjacency list of the MST, where each row is a pair of nodes (parent, child).
    """

    N = len(cost_mat)
    lat_mat = np.ceil(np.log2(np.maximum(cost_mat, 1)))
    parent = np.full(N, -2, dtype=np.int32)  # -2: not visited, -1: root

    parent[0] = -1
    idxs = np.arange(N)

    mapping = np.empty((N - 1, 2), dtype=np.int32)
    latency = np.zeros((N,), dtype=np.int32)

    if dc >= 0:
        _dc = (2**dc - 1) + ceil(log2(np.max(cost_mat[0]) + 1e-32))
    else:
        _dc = -1

    for n_impl in range(1, N):
        implemented = parent != -2
        _cost = cost_mat[~implemented][:, implemented]
        if dc >= 0:
            _lat = lat_mat[~implemented][:, implemented]
            _cost = np.where(np.maximum(_lat, latency[implemented]) + 1 <= _dc, _cost, np.iinfo(_cost.dtype).max // 2)
        _idx = int(np.argmin(_cost))
        _i, _j = _idx // n_impl, _idx % n_impl
        i, j = idxs[~implemented][_i], idxs[implemented][_j]
        parent[i] = j
        mapping[n_impl - 1, 0] = j
        mapping[n_impl - 1, 1] = i
        latency[i] = max(lat_mat[i, j], latency[j]) + 1  # type: ignore

    return mapping


@jit(cache=True)
def kernel_decompose(kernel: np.ndarray, dc: int = -2):
    """Decompose a 2D kernel matrix into two matrices with the delay-constrained approximate MST.

    Parameters
    ----------
    kernel : np.ndarray
        The input kernel matrix to decompose.

    dc : int, optional
        Delay constraint, by default -2.
        If -2, no delay constraint is applied.
        If -1, return the trivial decomposition (m0 = kernel, m1 = I).

        The delay constraint limits the maximum latency (hops) of the decomposed
        multiplication structure.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        The decomposed matrices (m0, m1): kernel = m0 @ m1
    """
    kernel, shift0, shift1 = _center(kernel)
    scale0, scale1 = 2.0**shift0, 2.0**shift1
    m, n = kernel.shape[0], kernel.shape[1] + 1
    mat_aug = np.zeros((m, n), dtype=kernel.dtype)
    mat_aug[:, 1:] = kernel
    diff0 = mat_aug[:, :, None] - mat_aug[:, None, :]
    diff1 = mat_aug[:, :, None] + mat_aug[:, None, :]
    dist0 = np.sum(np.sum(_volatile_int_arr_to_csd(diff0) != 0, axis=3), axis=0)
    dist1 = np.sum(np.sum(_volatile_int_arr_to_csd(diff1) != 0, axis=3), axis=0)
    sign = np.where(dist1 - dist0 < 0, -1, 1)
    dist = np.minimum(dist0, dist1)
    mapping = prim_mst_dc(dist, dc=dc)
    n_in, n_out = kernel.shape
    m0, m1 = np.zeros((n_in, n_out), dtype=kernel.dtype), np.zeros((n_out, n_out), dtype=kernel.dtype)

    if dc == -1:
        m0[:] = kernel
        m1[:] = np.eye(n_out, dtype=kernel.dtype)
        return m0 * scale0[:, None], m1 * scale1

    cnt = 0
    for _from, _to in mapping:
        col0 = mat_aug[:, _to] - mat_aug[:, _from] * sign[_to, _from]
        if _from != 0:
            col1 = m1[:, _from - 1].copy() * sign[_to, _from]
        else:
            col1 = np.zeros(n_out, dtype=kernel.dtype)
        if np.any(col0 != 0):
            col1[cnt] = 1
            m0[:, cnt] = col0
            cnt += 1
        m1[:, _to - 1] = col1
    return m0 * scale0[:, None], m1 * scale1
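Roughly, the factorization above expresses each kernel column as plus or minus another column (chosen by prim_mst_dc as the cheapest parent under the optional depth constraint) plus one new difference column collected in m0. A quick self-check sketch (illustrative only, not shipped in the wheel; the random kernel is a placeholder):

import numpy as np

from da4ml.cmvm.util.mat_decompose import kernel_decompose

rng = np.random.default_rng(0)
kernel = rng.integers(-8, 8, size=(6, 4)).astype(np.float64)  # e.g. a small quantized dense kernel

m0, m1 = kernel_decompose(kernel, dc=-2)   # -2: no delay constraint
np.testing.assert_allclose(m0 @ m1, kernel)

m0, m1 = kernel_decompose(kernel, dc=-1)   # -1: trivial factorization (m1 is a power-of-two-scaled identity)
np.testing.assert_allclose(m0 @ m1, kernel)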
da4ml/codegen/hls/hls_codegen.py
@@ -0,0 +1,196 @@
from collections.abc import Callable

from ...cmvm.types import CombLogic, QInterval, _minimal_kif
from ...trace.fixed_variable import _const_f


def kif_to_vitis_type(k: bool | int = 1, i: int = 0, f: int = 0):
    if k == i == f == 0:
        f = 1
    return f'ap_{"" if k else "u"}fixed<{k + i + f},{k + i}>'


def kif_to_hlslib_type(k: bool | int = 1, i: int = 0, f: int = 0):
    if k == i == f == 0:
        f = 1
    return f'ac_fixed<{int(k)},{k + i + f},{k + i}>'


def kif_to_oneapi_type(k: bool | int = 1, i: int = 0, f: int = 0):
    # OneAPI requires at least 2 bits for all ac_fixed as of 2025.1
    return f'ac_fixed<{int(k)},{max(k + i + f, 2)},{k + i}>'


def get_typestr_fn(flavor: str):
    match flavor.lower():
        case 'vitis':
            typestr_fn = kif_to_vitis_type
        case 'hlslib':
            typestr_fn = kif_to_hlslib_type
        case 'oneapi':
            typestr_fn = kif_to_oneapi_type
        case _:
            raise ValueError(f'Unsupported flavor: {flavor}')
    return typestr_fn


def ssa_gen(sol: CombLogic, print_latency: bool, typestr_fn: Callable[[bool | int, int, int], str]):
    ops = sol.ops
    all_kifs = list(map(_minimal_kif, (op.qint for op in ops)))
    all_types = list(map(lambda x: typestr_fn(*x), all_kifs))

    lines = []
    ref_count = sol.ref_count
    for i, op in enumerate(ops):
        if ref_count[i] == 0:
            # Skip unused ops
            continue

        _type = all_types[i]

        ref0 = f'v{op.id0}'

        match op.opcode:
            case -1:
                # Input marker
                val = f'model_inp[{op.id0}]'
            case 0 | 1:
                # Common a+/-b<<shift op
                ref1 = f'bit_shift<{op.data}>(v{op.id1})' if op.data != 0 else f'v{op.id1}'
                val = f'{ref0} {"-" if op.opcode == 1 else "+"} {ref1}'
            case 2 | -2:
                if op.opcode == 2:  # relu(model_inp)
                    if ops[op.id0].qint.min < 0:
                        val = f'{ref0} > 0 ? {_type}({ref0}) : {_type}(0)'
                    else:
                        val = ref0
                else:  # relu(-model_inp)
                    if ops[op.id0].qint.max > 0:
                        val = f'{ref0} > 0 ? {_type}(0) : {_type}(-{ref0})'
                    else:
                        val = f'-{ref0}'
            case 3 | -3:
                # Explicit quantization op, done implicitly via assignment
                val = ref0 if op.opcode == 3 else f'-{ref0}'
            case 4:
                # Constant addition
                _number = op.data * op.qint.step
                sign, mag = ('-' if _number < 0 else '+'), abs(_number)
                f = _const_f(mag)
                const_type_str = typestr_fn(*_minimal_kif(QInterval(mag, mag, 2.0**-f)))
                val = f'{ref0} {sign} {const_type_str}({mag})'
            case 5:
                # Define constant
                _number = op.data * op.qint.step
                val = f'{_number}'
            case 6 | -6:
                # MSB Mux
                id_c = op.data & 0xFFFFFFFF
                bw_k = sum(all_kifs[id_c])
                shift = (op.data >> 32) & 0xFFFFFFFF
                shift = shift if shift < 0x80000000 else shift - 0x100000000
                ref_k = f'v{id_c}[{bw_k - 1}]'
                sign = '-' if op.opcode == -6 else ''
                ref1 = f'v{op.id1}' if shift == 0 else f'bit_shift<{shift}>(v{op.id1})'
                bw0, bw1 = sum(all_kifs[op.id0]), sum(all_kifs[op.id1])
                if bw0 == 0:
                    ref0 = '0'
                if bw1 == 0:
                    ref1 = '0'
                val = f'{ref_k} ? {_type}({ref0}) : {_type}({sign}{ref1})'
            case 7:
                # Multiplication
                ref1 = f'v{op.id1}'
                val = f'{ref0} * {ref1}'
            case _:
                raise ValueError(f'Unsupported opcode: {op.opcode}')

        line = f'{_type} v{i} = {val};'

        if print_latency:
            line += f' // {op.latency}'
        lines.append(line)
    return lines


def output_gen(sol: CombLogic, typestr_fn: Callable[[bool | int, int, int], str]):
    lines = []
    for i, idx in enumerate(sol.out_idxs):
        if idx < 0:
            lines.append(f'model_out[{i}] = 0;')
            continue
        _type = typestr_fn(*_minimal_kif(sol.out_qint[i]))
        shift = sol.out_shifts[i]
        neg_str = '-' if sol.out_negs[i] else ''
        if shift == 0:
            lines.append(f'model_out[{i}] = {_type}({neg_str}v{idx});')
        else:
            lines.append(f'model_out[{i}] = {_type}({neg_str}bit_shift<{shift}>(v{idx}));')
    return lines


def get_io_types(sol: CombLogic, flavor: str):
    typestr_fn = get_typestr_fn(flavor)
    in_kif = map(max, zip(*map(_minimal_kif, sol.inp_qint)))
    inp_type = typestr_fn(*in_kif)
    out_kif = map(max, zip(*map(_minimal_kif, sol.out_qint)))
    out_type = typestr_fn(*out_kif)
    return inp_type, out_type


def hls_logic_and_bridge_gen(
    sol: CombLogic,
    fn_name: str,
    flavor: str,
    pragmas: list[str] | None = None,
    n_indent: int = 4,
    n_base_indent: int = 0,
    print_latency: bool = False,
):
    typestr_fn = get_typestr_fn(flavor)
    inp_t, out_t = get_io_types(sol, flavor)

    n_in, n_out = sol.shape
    template_def = 'template <typename inp_t, typename out_t>'
    fn_signature = f'void {fn_name}(inp_t model_inp[{n_in}], out_t model_out[{n_out}])'
    pragmas = pragmas or []

    ssa_lines = ssa_gen(sol, print_latency=print_latency, typestr_fn=typestr_fn)
    output_lines = output_gen(sol, typestr_fn=typestr_fn)

    indent = ' ' * n_indent
    base_indent = indent * n_base_indent
    body_indent = '\n' + base_indent + indent
    code = f"""{base_indent}{template_def}
{base_indent}{fn_signature} {{ // {inp_t} -> {out_t}
{base_indent + indent}{body_indent.join(pragmas)}
{body_indent}{body_indent.join(ssa_lines)}
{body_indent}{body_indent.join(output_lines)}
{base_indent}}}
"""
    bridge = f"""#include "binder_util.hh"
#include "{fn_name}.hh"

struct {fn_name}_config {{
    static const size_t N_inp = {n_in};
    static const size_t N_out = {n_out};
    typedef {inp_t} inp_t;
    typedef {out_t} out_t;
    constexpr static auto f = {fn_name}<inp_t, out_t>;
}};

extern "C" {{

bool openmp_enabled() {{
    return _openmp;
}}

void inference_f64(double *model_inp, double *model_out, size_t size, size_t n_threads) {{
    batch_inference<{fn_name}_config, double>(model_inp, model_out, size, n_threads);
}}

void inference_f32(float *model_inp, float *model_out, size_t size, size_t n_threads) {{
    batch_inference<{fn_name}_config, float>(model_inp, model_out, size, n_threads);
}}
}}"""
    return code, bridge
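For reference, the (k, i, f) triple fed to these helpers appears to encode the sign bit, integer bits and fractional bits; the flavor-specific functions above then render the matching fixed-point type strings, padding degenerate zero-width types. A few spot checks derived directly from the definitions above (illustrative only, not shipped in the wheel):

from da4ml.codegen.hls.hls_codegen import kif_to_oneapi_type, kif_to_vitis_type

assert kif_to_vitis_type(k=1, i=3, f=4) == 'ap_fixed<8,4>'     # signed, 8 bits total, 4 integer bits
assert kif_to_vitis_type(k=0, i=2, f=6) == 'ap_ufixed<8,2>'    # unsigned
assert kif_to_vitis_type(0, 0, 0) == 'ap_ufixed<1,0>'          # zero-width kif padded to 1 bit
assert kif_to_oneapi_type(k=1, i=0, f=0) == 'ac_fixed<1,2,1>'  # oneAPI pads the width to >= 2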
da4ml/codegen/hls/hls_model.py
@@ -0,0 +1,255 @@
import ctypes
import os
import re
import shutil
import subprocess
import sys
from collections.abc import Sequence
from pathlib import Path
from typing import TypeVar
from uuid import uuid4

import numpy as np
from numpy.typing import NDArray

from da4ml.cmvm.types import CombLogic
from da4ml.codegen.hls.hls_codegen import get_io_types, hls_logic_and_bridge_gen

from ... import codegen
from ...cmvm.types import _minimal_kif

T = TypeVar('T', bound=np.floating)


class HLSModel:
    def __init__(
        self,
        solution: CombLogic,
        prj_name: str,
        path: str | Path,
        flavor: str = 'vitis',
        print_latency: bool = True,
        part_name: str = 'xcvu13p-flga2577-2-e',
        pragma: Sequence[str] | None = None,
        clock_period: int = 5,
        clock_uncertainty: float = 0.1,
        io_delay_minmax: tuple[float, float] = (0.2, 0.4),
    ):
        self._solution = solution
        self._prj_name = prj_name
        self._path = Path(path).resolve()
        self._flavor = flavor.lower()
        assert self._flavor in ('vitis', 'hlslib', 'oneapi'), f'Unsupported HLS flavor: {self._flavor}'
        self._print_latency = print_latency
        self._part_name = part_name
        self._clock_period = clock_period
        self._clock_uncertainty = clock_uncertainty
        self._io_delay_minmax = io_delay_minmax
        self.__src_root = Path(codegen.__file__).parent
        self._lib = None
        self._uuid = None

        if pragma is None:
            if self._flavor == 'vitis':
                self._pragma = (
                    '#pragma HLS ARRAY_PARTITION variable=inp complete',
                    '#pragma HLS ARRAY_PARTITION variable=out complete',
                    '#pragma HLS PIPELINE II=1',
                )
            else:
                self._pragma = ()
        else:
            self._pragma = tuple(pragma)

    def write(self):
        if not self._path.exists():
            self._path.mkdir(parents=True, exist_ok=True)
        template_def, bridge = hls_logic_and_bridge_gen(
            self._solution,
            self._prj_name,
            self._flavor,
            ['#pragma HLS INLINE'],
            4,
            0,
            self._print_latency,
        )

        headers = ['#pragma once', '#include "bitshift.hh"']

        inp_type, out_type = get_io_types(self._solution, self._flavor)
        n_in, n_out = len(self._solution.inp_qint), len(self._solution.out_qint)
        template_signature = (
            f'template <typename inp_t, typename out_t>\nvoid {self._prj_name}(inp_t inp[{n_in}], out_t out[{n_out}]);'
        )
        fn_signature = f'void {self._prj_name}_fn({inp_type} inp[{n_in}], {out_type} out[{n_out}])'

        with open(self._path / f'{self._prj_name}.hh', 'w') as f:
            f.write('\n'.join(headers) + '\n\n')
            f.write(f'{template_signature}\n\n{fn_signature};\n')

        pragma_str = '\n'.join(self._pragma)
        cpp_def = f"""
#include "{self._prj_name}.hh"

{template_def}

{fn_signature} {{
    {pragma_str}
    {self._prj_name}<{inp_type}, {out_type}>(inp, out);
}}
"""
        with open(self._path / f'{self._prj_name}.cc', 'w') as f:
            f.write(cpp_def)

        with open(self._path / f'{self._prj_name}_bridge.cc', 'w') as f:
            f.write(bridge)

        shutil.copy(self.__src_root / 'hls/source/binder_util.hh', self._path)
        shutil.copy(self.__src_root / f'hls/source/{self._flavor}_bitshift.hh', self._path / 'bitshift.hh')
        shutil.copy(self.__src_root / 'hls/source/build_binder.mk', self._path)
        if self._flavor == 'vitis':
            shutil.copytree(self.__src_root / 'hls/source/ap_types', self._path / 'ap_types', dirs_exist_ok=True)
        else:
            pass

        self._solution.save(self._path / 'project.json')

    def _compile(self, verbose=False, openmp=True, o3: bool = False, clean=True):
        """Same as compile, but does not (re)write the generated sources first.

        Parameters
        ----------
        verbose : bool, optional
            Verbose output, by default False
        openmp : bool, optional
            Enable OpenMP, by default True
        o3 : bool, optional
            Turn on the -O3 flag, by default False
        clean : bool, optional
            Remove obsolete shared object files, by default True

        Raises
        ------
        RuntimeError
            If compilation fails
        """

        self._uuid = str(uuid4())
        args = ['make', '-f', 'build_binder.mk']
        env = os.environ.copy()
        env['PRJ_NAME'] = self._prj_name
        env['STAMP'] = self._uuid
        env['EXTRA_CXXFLAGS'] = '-fopenmp' if openmp else ''
        if o3:
            args.append('fast')

        if clean:
            m = re.compile(r'^lib.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\.so$')
            for p in self._path.iterdir():
                if not p.is_dir() and m.match(p.name):
                    p.unlink()

        try:
            r = subprocess.run(args, env=env, check=True, cwd=self._path, capture_output=not verbose)
        except subprocess.CalledProcessError as e:
            print(e.stderr.decode(), file=sys.stderr)
            print(e.stdout.decode(), file=sys.stdout)
            raise RuntimeError('Compilation failed!!') from e
        if r.returncode != 0:
            print(r.stderr.decode(), file=sys.stderr)
            print(r.stdout.decode(), file=sys.stderr)
            raise RuntimeError('Compilation failed!!')

        self._load_lib(self._uuid)

    def _load_lib(self, uuid: str | None = None):
        uuid = uuid if uuid is not None else self._uuid
        self._uuid = uuid
        lib_path = self._path / f'lib{self._prj_name}_{uuid}.so'
        if not lib_path.exists():
            raise RuntimeError(f'Library {lib_path} does not exist')
        self._lib = ctypes.CDLL(str(lib_path))

    def compile(self, verbose=False, openmp=True, o3: bool = False, clean=True):
        """Compile the model to a shared object file.

        Parameters
        ----------
        verbose : bool, optional
            Verbose output, by default False
        openmp : bool, optional
            Enable OpenMP, by default True
        o3 : bool, optional
            Turn on the -O3 flag, by default False
        clean : bool, optional
            Remove obsolete shared object files, by default True

        Raises
        ------
        RuntimeError
            If compilation fails
        """
        self.write()
        self._compile(verbose, openmp, o3, clean)

    def predict(self, data: NDArray[T] | Sequence[NDArray[T]], n_threads: int = 0) -> NDArray[T]:
        """Run the model on the input data.

        Parameters
        ----------
        data : NDArray[np.floating] | Sequence[NDArray[np.floating]]
            Input data to the model. The shape is ignored, and the number of samples is
            determined by the size of the data.

        Returns
        -------
        NDArray[np.floating]
            Output of the model in shape (n_samples, output_size).
        """
        assert self._lib is not None, 'Library not loaded, call .compile() first.'
        inp_size, out_size = self._solution.shape

        if isinstance(data, Sequence):
            data = np.concatenate([a.reshape(a.shape[0], -1) for a in data], axis=-1)

        dtype = data.dtype
        if dtype not in (np.float32, np.float64):
            raise TypeError(f'Unsupported input data type: {dtype}. Expected float32 or float64.')
        c_dtype = ctypes.c_float if dtype == np.float32 else ctypes.c_double

        assert data.size % inp_size == 0, f'Input size {data.size} is not divisible by {inp_size}'
        n_sample = data.size // inp_size

        inp_data = np.ascontiguousarray(data)
        out_data = np.empty(n_sample * out_size, dtype=dtype)

        inp_buf = inp_data.ctypes.data_as(ctypes.POINTER(c_dtype))
        out_buf = out_data.ctypes.data_as(ctypes.POINTER(c_dtype))
        if dtype == np.float32:
            self._lib.inference_f32(inp_buf, out_buf, n_sample, n_threads)
        else:
            self._lib.inference_f64(inp_buf, out_buf, n_sample, n_threads)

        return out_data.reshape(n_sample, out_size)  # type: ignore

    def __repr__(self):
        inp_size, out_size = self._solution.shape
        inp_size, out_size = self._solution.shape
        cost = round(self._solution.cost)
        inp_kifs = tuple(zip(*map(_minimal_kif, self._solution.inp_qint)))
        out_kifs = tuple(zip(*map(_minimal_kif, self._solution.out_qint)))
        in_bits, out_bits = np.sum(inp_kifs), np.sum(out_kifs)

        spec = f"""Top Function: {self._prj_name}\n====================
{inp_size} ({in_bits} bits) -> {out_size} ({out_bits} bits)
combinational @ delay={self._solution.latency}
Estimated cost: {cost} LUTs"""

        is_compiled = self._lib is not None
        if is_compiled:
            assert self._uuid is not None
            openmp = 'with OpenMP' if self._lib.openmp_enabled() else ''  # type: ignore
            spec += f'\nEmulator is compiled {openmp} ({self._uuid[-12:]})'
        else:
            spec += '\nEmulator is **not compiled**'
        return spec
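HLSModel above ties the generator together: write() emits the HLS sources plus a C bridge, compile() builds them into a shared object via build_binder.mk, and predict() calls the resulting inference_f32/f64 entry points through ctypes. A hypothetical driver sketch (not shipped in the wheel; `solution` is assumed to be a CombLogic obtained from da4ml's tracing/CMVM front end, and the project name and path are placeholders):

import numpy as np

from da4ml.codegen.hls.hls_model import HLSModel

model = HLSModel(
    solution,                  # assumed: a da4ml.cmvm.types.CombLogic produced elsewhere
    prj_name='mlp_l0',         # placeholder project name
    path='/tmp/mlp_l0_hls',    # placeholder output directory
    flavor='vitis',            # 'vitis', 'hlslib' or 'oneapi'
)

model.compile(openmp=True)     # write() + make -f build_binder.mk, then load the emulator via ctypes

n_in, _ = solution.shape
x = np.random.rand(128, n_in).astype(np.float32)
y = model.predict(x)           # shape (128, n_out), evaluated by the compiled emulator
print(model)                   # summary: I/O widths, combinational delay, estimated LUT cost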