PyPI - da4ml - Versions diffs - 0.5.1.post1__cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl - Mend

da4ml 0.5.1.post1__cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

da4ml/__init__.py +4 -0
da4ml/_binary/__init__.py +15 -0
da4ml/_binary/dais_bin.cpython-311-x86_64-linux-gnu.so +0 -0
da4ml/_binary/dais_bin.pyi +5 -0
da4ml/_cli/__init__.py +30 -0
da4ml/_cli/convert.py +204 -0
da4ml/_cli/report.py +295 -0
da4ml/_version.py +32 -0
da4ml/cmvm/__init__.py +4 -0
da4ml/cmvm/api.py +264 -0
da4ml/cmvm/core/__init__.py +221 -0
da4ml/cmvm/core/indexers.py +83 -0
da4ml/cmvm/core/state_opr.py +284 -0
da4ml/cmvm/types.py +739 -0
da4ml/cmvm/util/__init__.py +7 -0
da4ml/cmvm/util/bit_decompose.py +86 -0
da4ml/cmvm/util/mat_decompose.py +121 -0
da4ml/codegen/__init__.py +9 -0
da4ml/codegen/hls/__init__.py +4 -0
da4ml/codegen/hls/hls_codegen.py +196 -0
da4ml/codegen/hls/hls_model.py +255 -0
da4ml/codegen/hls/source/ap_types/ap_binary.h +78 -0
da4ml/codegen/hls/source/ap_types/ap_common.h +376 -0
da4ml/codegen/hls/source/ap_types/ap_decl.h +212 -0
da4ml/codegen/hls/source/ap_types/ap_fixed.h +360 -0
da4ml/codegen/hls/source/ap_types/ap_fixed_base.h +2354 -0
da4ml/codegen/hls/source/ap_types/ap_fixed_ref.h +718 -0
da4ml/codegen/hls/source/ap_types/ap_fixed_special.h +230 -0
da4ml/codegen/hls/source/ap_types/ap_int.h +330 -0
da4ml/codegen/hls/source/ap_types/ap_int_base.h +1885 -0
da4ml/codegen/hls/source/ap_types/ap_int_ref.h +1346 -0
da4ml/codegen/hls/source/ap_types/ap_int_special.h +223 -0
da4ml/codegen/hls/source/ap_types/ap_shift_reg.h +138 -0
da4ml/codegen/hls/source/ap_types/etc/ap_private.h +7199 -0
da4ml/codegen/hls/source/ap_types/hls_math.h +27 -0
da4ml/codegen/hls/source/ap_types/hls_stream.h +263 -0
da4ml/codegen/hls/source/ap_types/utils/x_hls_utils.h +80 -0
da4ml/codegen/hls/source/binder_util.hh +71 -0
da4ml/codegen/hls/source/build_binder.mk +22 -0
da4ml/codegen/hls/source/vitis_bitshift.hh +32 -0
da4ml/codegen/rtl/__init__.py +15 -0
da4ml/codegen/rtl/common_source/binder_util.hh +99 -0
da4ml/codegen/rtl/common_source/build_binder.mk +34 -0
da4ml/codegen/rtl/common_source/build_quartus_prj.tcl +104 -0
da4ml/codegen/rtl/common_source/build_vivado_prj.tcl +111 -0
da4ml/codegen/rtl/common_source/ioutil.hh +124 -0
da4ml/codegen/rtl/common_source/template.sdc +27 -0
da4ml/codegen/rtl/common_source/template.xdc +30 -0
da4ml/codegen/rtl/rtl_model.py +486 -0
da4ml/codegen/rtl/verilog/__init__.py +10 -0
da4ml/codegen/rtl/verilog/comb.py +239 -0
da4ml/codegen/rtl/verilog/io_wrapper.py +113 -0
da4ml/codegen/rtl/verilog/pipeline.py +67 -0
da4ml/codegen/rtl/verilog/source/lookup_table.v +27 -0
da4ml/codegen/rtl/verilog/source/multiplier.v +37 -0
da4ml/codegen/rtl/verilog/source/mux.v +58 -0
da4ml/codegen/rtl/verilog/source/negative.v +31 -0
da4ml/codegen/rtl/verilog/source/shift_adder.v +59 -0
da4ml/codegen/rtl/vhdl/__init__.py +9 -0
da4ml/codegen/rtl/vhdl/comb.py +206 -0
da4ml/codegen/rtl/vhdl/io_wrapper.py +120 -0
da4ml/codegen/rtl/vhdl/pipeline.py +71 -0
da4ml/codegen/rtl/vhdl/source/lookup_table.vhd +52 -0
da4ml/codegen/rtl/vhdl/source/multiplier.vhd +40 -0
da4ml/codegen/rtl/vhdl/source/mux.vhd +102 -0
da4ml/codegen/rtl/vhdl/source/negative.vhd +35 -0
da4ml/codegen/rtl/vhdl/source/shift_adder.vhd +101 -0
da4ml/converter/__init__.py +63 -0
da4ml/converter/hgq2/__init__.py +3 -0
da4ml/converter/hgq2/layers/__init__.py +11 -0
da4ml/converter/hgq2/layers/_base.py +132 -0
da4ml/converter/hgq2/layers/activation.py +81 -0
da4ml/converter/hgq2/layers/attn.py +148 -0
da4ml/converter/hgq2/layers/batchnorm.py +15 -0
da4ml/converter/hgq2/layers/conv.py +149 -0
da4ml/converter/hgq2/layers/dense.py +39 -0
da4ml/converter/hgq2/layers/ops.py +246 -0
da4ml/converter/hgq2/layers/pool.py +107 -0
da4ml/converter/hgq2/layers/table.py +176 -0
da4ml/converter/hgq2/parser.py +161 -0
da4ml/trace/__init__.py +6 -0
da4ml/trace/fixed_variable.py +965 -0
da4ml/trace/fixed_variable_array.py +600 -0
da4ml/trace/ops/__init__.py +13 -0
da4ml/trace/ops/einsum_utils.py +305 -0
da4ml/trace/ops/quantization.py +74 -0
da4ml/trace/ops/reduce_utils.py +105 -0
da4ml/trace/pipeline.py +181 -0
da4ml/trace/tracer.py +186 -0
da4ml/typing/__init__.py +3 -0
da4ml-0.5.1.post1.dist-info/METADATA +85 -0
da4ml-0.5.1.post1.dist-info/RECORD +96 -0
da4ml-0.5.1.post1.dist-info/WHEEL +6 -0
da4ml-0.5.1.post1.dist-info/entry_points.txt +3 -0
da4ml-0.5.1.post1.dist-info/sboms/auditwheel.cdx.json +1 -0
da4ml.libs/libgomp-e985bcbb.so.1.0.0 +0 -0

da4ml/cmvm/types.py ADDED Viewed

@@ -0,0 +1,739 @@
+import json
+from collections.abc import Sequence
+from decimal import Decimal
+from functools import reduce, singledispatch
+from math import ceil, floor, log2
+from pathlib import Path
+from typing import TYPE_CHECKING, NamedTuple, TypeVar
+import numpy as np
+from numba import jit
+from numpy import float32, int8
+from numpy.typing import NDArray
+from .._binary import dais_interp_run
+if TYPE_CHECKING:
+    from ..trace.fixed_variable import FixedVariable, LookupTable
+class QInterval(NamedTuple):
+    """A class representing a quantized interval: [min, max] with a step size."""
+    min: float
+    max: float
+    step: float
+    @classmethod
+    def from_kif(cls, k: int | bool, i: int, f: int):
+        _high = 2.0**i
+        step = 2.0**-f
+        low, high = -k * step, _high - step
+        return cls(low, high, step)
+    @classmethod
+    def from_precision(cls, prec: 'Precision'):
+        return cls.from_kif(*prec)
+    @property
+    def precision(self):
+        return Precision.from_qint(self)
+    def __repr__(self):
+        return f'[{self.min}, {self.max}, {self.step}]'
+class Precision(NamedTuple):
+    """A class representing the precision of a quantized interval."""
+    keep_negative: bool
+    integers: int
+    fractional: int
+    def __str__(self):
+        k, i, f = self.keep_negative, self.integers, self.fractional
+        return f'fixed({k=}, {i=}, {f=})'
+    def __repr__(self):
+        return str(self)
+    @classmethod
+    def from_qint(cls, qint: QInterval, symmetric: bool = False):
+        return _minimal_kif(qint, symmetric=symmetric)
+    @property
+    def qint(self):
+        return QInterval.from_kif(*self)
+class Op(NamedTuple):
+    """One single operation on the data buffer.
+    Parameters
+    ----------
+    id0: int
+        index of the first operand (for opcode -1: index into the external input)
+    id1: int
+        index of the second operand; -1 is treated as "unused" when counting references
+    opcode: int
+        As interpreted by ``CombLogic.__call__``:
+        -1: copy from the external input,
+        0: addition, 1: subtraction (second operand scaled by 2**data),
+        2/-2: relu(+/-x), 3/-3: quantize(+/-x),
+        4: const addition (data * qint.step), 5: const definition (data * qint.step),
+        6/-6: MSB mux (low 32 bits of data: condition index, high 32 bits: signed shift),
+        7: multiplication, 8: table lookup (data: table index)
+    data: int
+        Data to be used in the operation
+    qint: QInterval
+        Quantization interval of the resultant buffer
+    latency: float
+        Latency of the data generated by this operation (t_available)
+    cost: float
+        Cost of the operation
+    """
+    id0: int
+    id1: int
+    opcode: int
+    data: int
+    qint: QInterval
+    latency: float
+    cost: float
+class Pair(NamedTuple):
+    """An operation representing data[id0] +/- data[id1] * 2**shift."""
+    # Index of the first operand.
+    id0: int
+    # Index of the second (shifted) operand.
+    id1: int
+    # Selects between "+" and "-"; presumably True means subtraction -- confirm against the users of Pair.
+    sub: bool
+    # Power-of-two exponent applied to data[id1].
+    shift: int
+class DAState(NamedTuple):
+    """Internal state of the DA algorithm."""
+    # NOTE(review): the field meanings below are inferred from names and types only;
+    # the code that builds and consumes DAState is not part of this file.
+    # Pair of int8 shift arrays (presumably per-input and per-output shifts -- confirm).
+    shifts: tuple[NDArray[int8], NDArray[int8]]
+    # List of int8 arrays describing the current expressions.
+    expr: list[NDArray[int8]]
+    # Operations emitted so far.
+    ops: list[Op]
+    # Occurrence count per candidate Pair.
+    freq_stat: dict[Pair, int]
+    # The float32 kernel associated with this state.
+    kernel: NDArray[float32]
+def _minimal_kif(qi: QInterval, symmetric: bool = False) -> Precision:
+    """Calculate the minimal KIF for a given QInterval.
+    Parameters
+    ----------
+    qi : QInterval
+        The QInterval to calculate the KIF for.
+    symmetric : bool
+        Only relevant if qi may be negative. If True, -2**i will be regarded as forbidden.
+        May be useful in special cases only.
+        Default is False.
+    Returns
+    -------
+    Precision
+        A named tuple with the KIF values.
+    """
+    # The constant-zero interval needs no bits at all.
+    if qi.min == qi.max == 0:
+        return Precision(keep_negative=False, integers=0, fractional=0)
+    keep_negative = qi.min < 0
+    # The step is expected to be a power of two, so -log2(step) is the fractional bit count.
+    fractional = int(-log2(qi.step))
+    # Bounds expressed in units of one step.
+    int_min, int_max = round(qi.min / qi.step), round(qi.max / qi.step)
+    if symmetric:
+        # Symmetric range: both -m and +m must fit, so m + 1 values per side.
+        bits = int(ceil(log2(max(abs(int_min), int_max) + 1)))
+    else:
+        # Asymmetric range: the negative side may reach one step further than the positive side.
+        bits = int(ceil(log2(max(abs(int_min), int_max + 1))))
+    # Total magnitude bits minus the fractional ones gives the integer bits (may be negative).
+    integers = bits - fractional
+    return Precision(keep_negative=keep_negative, integers=integers, fractional=fractional)
+# Type checkers see a plain typed stub; at runtime minimal_kif is the numba-jitted
+# version of _minimal_kif.
+if TYPE_CHECKING:
+    def minimal_kif(qi: QInterval, symmetric: bool = False) -> Precision: ...
+else:
+    minimal_kif = jit(_minimal_kif)
+# Value types accepted by the _relu/_quantize dispatchers: symbolic FixedVariable or plain numerics.
+T = TypeVar('T', 'FixedVariable', float, int, np.float32, np.float64, Decimal)
+@singledispatch
+def _relu(v: 'T', i: int | None = None, f: int | None = None, inv: bool = False, round_mode: str = 'TRN') -> 'T':
+    from ..trace.fixed_variable import FixedVariable
+    assert isinstance(v, FixedVariable), f'Unknown type {type(v)} for symbolic relu'
+    if inv:
+        v = -v
+    return v.relu(i, f, round_mode=round_mode)
+@_relu.register(float)
+@_relu.register(int)
+@_relu.register(np.float32)
+@_relu.register(np.float64)
+def _(v, i: int | None = None, f: int | None = None, inv: bool = False, round_mode: str = 'TRN'):
+    if inv:
+        v = -v
+    v = max(0, v)
+    if f is not None:
+        if round_mode.upper() == 'RND':
+            v += 2.0 ** (-f - 1)
+        sf = 2.0**f
+        v = floor(v * sf) / sf
+    if i is not None:
+        v = v % 2.0**i
+    return v
+@_relu.register
+def _(v: Decimal, i: int | None = None, f: int | None = None, inv: bool = False, round_mode: str = 'TRN'):
+    if inv:
+        v = -v
+    v = max(Decimal(0), v)
+    if f is not None:
+        if round_mode.upper() == 'RND':
+            v += Decimal(2) ** (-f - 1)
+        sf = Decimal(2) ** f
+        v = floor(v * sf) / sf
+    if i is not None:
+        v = v % Decimal(2) ** i
+    return v
+@singledispatch
+def _quantize(v: 'T', k: int | bool, i: int, f: int, round_mode: str = 'TRN') -> 'T':
+    from ..trace.fixed_variable import FixedVariable
+    assert isinstance(v, FixedVariable), f'Unknown type {type(v)} for symbolic quantization'
+    return v.quantize(k, i, f, round_mode=round_mode)
+@_quantize.register(float)
+@_quantize.register(int)
+@_quantize.register(np.float32)
+@_quantize.register(np.float64)
+def _(v, k: int | bool, i: int, f: int, round_mode: str = 'TRN'):
+    if round_mode.upper() == 'RND':
+        v += 2.0 ** (-f - 1)
+    b = k + i + f
+    bias = 2.0 ** (b - 1) * k
+    eps = 2.0**-f
+    return eps * ((np.floor(v / eps) + bias) % 2**b - bias)
+@_quantize.register
+def _(v: Decimal, k: int | bool, i: int, f: int, round_mode: str = 'TRN'):
+    if round_mode.upper() == 'RND':
+        v += Decimal(2) ** (-f - 1)
+    b = k + i + f
+    bias = Decimal(2) ** (b - 1) * k
+    eps = Decimal(2) ** -f
+    return eps * ((floor(v / eps) + bias) % Decimal(2) ** b - bias)
+class JSONEncoder(json.JSONEncoder):
+    def default(self, o):
+        if hasattr(o, 'to_dict'):
+            return o.to_dict()
+        super().default(o)
+class CombLogic(NamedTuple):
+    """A combinational logic that describes a series of operations on input data to produce output data.
+    Attributes
+    ----------
+    shape: tuple[int, int]
+        #input, #output
+    inp_shifts: list[int]
+        The shifts that should be applied to the input data.
+    out_idxs: list[int]
+        The indices of the output data in the buffer.
+    out_shifts: list[int]
+        The shifts that should be applied to the output data.
+    out_negs: list[bool]
+        The signs of the output data.
+    ops: list[Op]
+        Core list of operations for generating each buffer element.
+    carry_size: int
+        Size of the carrier for the adder, used for cost and latency estimation.
+    adder_size: int
+        Elementary size of the adder, used for cost and latency estimation.
+    lookup_tables: tuple[LookupTable, ...] | None
+        Lookup table arrays for lookup operations, if any.
+    The core part of the comb logic is the operations in the ops list.
+    For the exact operations executed with Op, refer to the Op class.
+    After all operations are executed, the output data is read from data[op.out_idx] and multiplied by 2**out_shift.
+    """
+    shape: tuple[int, int]
+    inp_shifts: list[int]
+    out_idxs: list[int]
+    out_shifts: list[int]
+    out_negs: list[bool]
+    ops: list[Op]
+    carry_size: int
+    adder_size: int
+    lookup_tables: 'tuple[LookupTable, ...] | None' = None
+    def __call__(self, inp: list | np.ndarray | tuple, quantize=False, debug=False, dump=False):
+        """Executes the solution on the input data.
+        Parameters
+        ----------
+        inp : list | np.ndarray | tuple
+            Input data to be processed. The input data should be a list or numpy array of objects.
+        quantize : bool
+            If True, the input data will be quantized to the output quantization intervals.
+            Only floating point data types are supported when quantize is True.
+            Default is False.
+        debug : bool
+            If True, the function will print debug information about the operations being performed.
+            Default is False.
+        dump : bool
+            If True, the return the whole buffer, without applying the output shifts and signs.
+            Default is False.
+        Returns
+        -------
+        np.ndarray
+            The output data after applying the operations defined in the solution.
+        """
+        from ..trace.fixed_variable import FixedVariable
+        buf = np.empty(len(self.ops), dtype=object)
+        inp = np.asarray(inp)
+        inp_qint = [op.qint for op in self.ops if op.opcode == -1]
+        if quantize:  # TRN and WRAP
+            k, i, f = map(np.array, zip(*map(minimal_kif, inp_qint)))
+            inp = [_quantize(*x, round_mode='TRN') for x in zip(inp, k, i, f)]
+        inp = inp * (2.0 ** np.array(self.inp_shifts))
+        for i, op in enumerate(self.ops):
+            match op.opcode:
+                case -1:  # copy form external buffer
+                    buf[i] = inp[op.id0]
+                case 0 | 1:  # addition
+                    v0, v1 = buf[op.id0], 2.0**op.data * buf[op.id1]
+                    buf[i] = v0 + v1 if op.opcode == 0 else v0 - v1
+                case 2 | -2:  # relu(+/-x)
+                    v = buf[op.id0]
+                    _, _i, _f = _minimal_kif(op.qint)
+                    buf[i] = _relu(v, _i, _f, inv=op.opcode == -2, round_mode='TRN')
+                case 3 | -3:  # quantize(+/-x)
+                    v = buf[op.id0] if op.opcode == 3 else -buf[op.id0]
+                    _k, _i, _f = _minimal_kif(op.qint)
+                    buf[i] = _quantize(v, _k, _i, _f, round_mode='TRN')
+                case 4:  # const addition
+                    bias = op.data * op.qint.step
+                    buf[i] = buf[op.id0] + bias
+                case 5:  # const definition
+                    buf[i] = op.data * op.qint.step  # const definition
+                case 6 | -6:  # MSB Mux
+                    id_c = op.data & 0xFFFFFFFF
+                    k, v0, v1 = buf[id_c], buf[op.id0], buf[op.id1]
+                    shift = (op.data >> 32) & 0xFFFFFFFF
+                    shift = shift if shift < 0x80000000 else shift - 0x100000000
+                    if op.opcode == -6:
+                        v1 = -v1
+                    if isinstance(k, FixedVariable):
+                        buf[i] = k.msb_mux(v0, v1 * 2**shift, op.qint)
+                    else:
+                        qint_k = self.ops[id_c].qint
+                        if qint_k.min < 0:
+                            buf[i] = v0 if k < 0 else v1 * 2.0**shift
+                        else:
+                            _k, _i, _f = _minimal_kif(qint_k)
+                            buf[i] = v0 if k >= 2.0 ** (_i - 1) else v1 * 2.0**shift
+                case 7:
+                    v0, v1 = buf[op.id0], buf[op.id1]
+                    buf[i] = v0 * v1
+                case 8:
+                    v0 = buf[op.id0]
+                    tables = self.lookup_tables
+                    assert tables is not None, 'No lookup table provided for lookup operation'
+                    table = tables[op.data]
+                    buf[i] = table.lookup(v0, self.ops[op.id0].qint)
+                case _:
+                    raise ValueError(f'Unknown opcode {op.opcode} in {op}')
+        sf = 2.0 ** np.array(self.out_shifts, dtype=np.float64)
+        sign = np.where(self.out_negs, -1, 1)
+        out_idx = np.array(self.out_idxs, dtype=np.int32)
+        mask = np.where(out_idx < 0, 0, 1)
+        if debug:
+            operands = []
+            for i, v in enumerate(buf):
+                op = self.ops[i]
+                match op.opcode:
+                    case -1:
+                        op_str = 'inp'
+                    case 0 | 1:
+                        _sign = '-' if op.opcode == 1 else '+'
+                        op_str = f'buf[{op.id0}] {_sign} buf[{op.id1}]<<{op.data}'
+                    case 2 | -2:
+                        _sign = '' if op.opcode == 2 else '-'
+                        op_str = f'relu({_sign}buf[{op.id0}])'
+                    case 3 | -3:
+                        _sign = '' if op.opcode == 3 else '-'
+                        op_str = f'quantize({_sign}buf[{op.id0}])'
+                    case 4:
+                        op_str = f'buf[{op.id0}] + {op.data * op.qint.step}'
+                    case 5:
+                        op_str = f'const {op.data * op.qint.step}'
+                    case 6 | -6:
+                        _sign = '-' if op.opcode == -6 else ''
+                        op_str = f'msb(buf[{op.data}]) ? buf[{op.id0}] : {_sign}buf[{op.id1}]'
+                    case 7:
+                        op_str = f'buf[{op.id0}] * buf[{op.id1}]'
+                    case 8:
+                        op_str = f'tables[{int(op.data)}].lookup(buf[{op.id0}])'
+                    case _:
+                        raise ValueError(f'Unknown opcode {op.opcode} in {op}')
+                result = f'|-> buf[{i}] = {v}'
+                operands.append((op_str, result))
+            max_len = max(len(op[0]) for op in operands)
+            for op_str, result in operands:
+                print(f'{op_str:<{max_len}} {result}')
+        if dump:
+            return buf
+        return buf[out_idx] * sf * sign * mask
+    @property
+    def kernel(self):
+        """the kernel represented by the solution, when applicable."""
+        kernel = np.empty(self.shape, dtype=np.float32)
+        for i, one_hot in enumerate(np.identity(self.shape[0])):
+            kernel[i] = self(one_hot)
+        return kernel
+    @property
+    def cost(self):
+        """Total cost of the solution."""
+        return float(sum(op.cost for op in self.ops))
+    @property
+    def latency(self):
+        """Minimum and maximum latency of the solution."""
+        latency = [self.ops[i].latency for i in self.out_idxs]
+        if len(latency) == 0:
+            return 0.0, 0.0
+        return min(latency), max(latency)
+    def __repr__(self):
+        n_in, n_out = self.shape
+        cost = self.cost
+        lat_min, lat_max = self.latency
+        return f'Solution([{n_in} -> {n_out}], cost={cost}, latency={lat_min}-{lat_max})'
+    @property
+    def out_latency(self):
+        """Latencies of all output elements of the solution."""
+        return [self.ops[i].latency if i >= 0 else 0.0 for i in self.out_idxs]
+    @property
+    def out_qint(self):
+        """Quantization intervals of the output elements."""
+        buf = []
+        for i, idx in enumerate(self.out_idxs):
+            _min, _max, _step = self.ops[idx].qint
+            sf = 2.0 ** self.out_shifts[i]
+            _min, _max, _step = _min * sf, _max * sf, _step * sf
+            if self.out_negs[i]:
+                _min, _max = -_max, -_min
+            buf.append(QInterval(_min, _max, _step))
+        return buf
+    @property
+    def out_kifs(self):
+        """KIFs of all output elements of the solution."""
+        return np.array([_minimal_kif(qi) for qi in self.out_qint]).T
+    @property
+    def inp_latency(self):
+        """Latencies of all input elements of the solution."""
+        return [op.latency for op in self.ops if op.opcode == -1]
+    @property
+    def inp_qint(self):
+        """Quantization intervals of the input elements."""
+        qints = [QInterval(0.0, 0.0, 1.0) for _ in range(self.shape[0])]
+        for op in self.ops:
+            if op.opcode != -1:
+                continue
+            qints[op.id0] = op.qint
+        return qints
+    @property
+    def inp_kifs(self):
+        """KIFs of all input elements of the solution."""
+        return np.array([_minimal_kif(qi) for qi in self.inp_qint]).T
+    def save(self, path: str | Path):
+        """Save the solution to a file."""
+        with open(path, 'w') as f:
+            json.dump(self, f, cls=JSONEncoder)
+    @classmethod
+    def deserialize(cls, data: list):
+        """Load the solution from a file."""
+        ops = []
+        for _op in data[5]:
+            op = Op(*_op[:4], QInterval(*_op[4]), *_op[5:])  # type: ignore
+            ops.append(op)
+        assert len(data) in (8, 9), f'{len(data)}'
+        lookup_tables = data[8] if len(data) > 8 else None
+        if lookup_tables is not None:
+            from ..trace.fixed_variable import LookupTable
+            lookup_tables = tuple(LookupTable.from_dict(tab) for tab in lookup_tables)
+        return cls(
+            shape=tuple(data[0]),
+            inp_shifts=data[1],
+            out_idxs=data[2],
+            out_shifts=data[3],
+            out_negs=data[4],
+            ops=ops,
+            carry_size=data[6],
+            adder_size=data[7],
+            lookup_tables=lookup_tables,
+        )
+    @classmethod
+    def load(cls, path: str | Path):
+        """Load the solution from a file."""
+        with open(path) as f:
+            data = json.load(f)
+        return cls.deserialize(data)
+    @property
+    def ref_count(self) -> np.ndarray:
+        """The number of references to the output elements in the solution."""
+        ref_count = np.zeros(len(self.ops), dtype=np.uint64)
+        for op in self.ops:
+            if op.opcode == -1:
+                continue
+            id0, id1 = op.id0, op.id1
+            if id0 != -1:
+                ref_count[id0] += 1
+            if id1 != -1:
+                ref_count[id1] += 1
+            if op.opcode in (6, -6):
+                # msb_mux operation
+                ref_count[op.data & 0xFFFFFFFF] += 1
+        for i in self.out_idxs:
+            if i < 0:
+                continue
+            ref_count[i] += 1
+        return ref_count
+    def to_binary(self, version: int = 0) -> NDArray[np.int32]:
+        n_in, n_out = self.shape
+        header_size_i32 = 6 + n_in + n_out * 3
+        n_tables = len(self.lookup_tables) if self.lookup_tables is not None else 0
+        header = np.concatenate(
+            [
+                [0, version, n_in, n_out, len(self.ops), n_tables],
+                self.inp_shifts,
+                self.out_idxs,
+                self.out_shifts,
+                self.out_negs,
+            ],
+            axis=0,
+            dtype=np.int32,
+        )
+        assert len(header) == header_size_i32, f'Header size mismatch: {len(header)} != {header_size_i32}'
+        code = np.empty((len(self.ops), 8), dtype=np.int32)
+        for i, op in enumerate(self.ops):
+            buf = code[i]
+            buf[0] = op.opcode
+            buf[1] = op.id0
+            buf[2] = op.id1
+            buf[5:] = _minimal_kif(op.qint)
+            buf_i64 = buf[3:5].view(np.int64)
+            if op.opcode != 8:
+                buf_i64[0] = op.data
+            else:
+                assert self.lookup_tables is not None
+                pad_left = self.lookup_tables[op.data]._get_pads(self.ops[op.id0].qint)[0]
+                buf_i64[0] = (pad_left << 32) | op.data
+        data = np.concatenate([header, code.flatten()])
+        if self.lookup_tables is None:
+            return data
+        tables = [table.table for table in self.lookup_tables]
+        table_sizes = [len(tab) for tab in tables]
+        table_data = np.concatenate([table_sizes] + tables, axis=0, dtype=np.int32)
+        data = np.concatenate([data, table_data])
+        return data
+    def save_binary(self, path: str | Path, version: int = 0):
+        """Dump the solution to a binary file."""
+        data = self.to_binary(version=version)
+        with open(path, 'wb') as f:
+            data.tofile(f)
+    def predict(
+        self,
+        data: NDArray | Sequence[NDArray],
+        n_threads: int = -1,
+    ) -> NDArray[np.float64]:
+        """Predict the output of the solution for a batch of input data with cpp backed DAIS interpreter.
+        Cannot be used if the binary interpreter is not installed.
+        Parameters
+        ----------
+        data : NDArray|Sequence[NDArray]
+            Input data to the model. The shape is ignored, and the number of samples is
+            determined by the size of the data.
+        n_threads: int
+            Number of threads to use for prediction.
+            Negative or zero values will use maximum available threads. Default is -1.
+            If OpenMP is not supported, this parameter is ignored.
+        Returns
+        -------
+        NDArray[np.float64]
+            Output of the model in shape (n_samples, output_size).
+        """
+        if isinstance(data, Sequence):
+            data = np.concatenate([a.reshape(a.shape[0], -1) for a in data], axis=-1)
+        if n_threads == 0:
+            n_threads = -1
+        bin_logic = self.to_binary()
+        return dais_interp_run(bin_logic, data, n_threads)
+class Pipeline(NamedTuple):
+    """A pipeline with II=1,with each stage represented by a CombLogic
+    Attributes
+    ----------
+    solutions: tuple[Solution, ...]
+        A tuple containing the individual Solution objects for each stage of the cascade.
+    Properties
+    ----------
+    kernel: NDArray[float32]
+        Only useful when the pipeline describes a linear operation.
+        The overall kernel matrix which the cascaded solution implements: vec @ kernel = solution(vec).
+        This is calculated as the matrix product of all individual solution kernels.
+    cost: float
+        The total cost of the cascaded solution, computed as the sum of the costs of all stages.
+    latency: tuple[float, float]
+        The minimum and maximum latency of the pipeline, determined by the last stage.
+    inp_qint: list[QInterval]
+        Input quantization intervals
+    inp_lat: list[float]
+        Input latencies
+    in_shift: list[int]
+        Input shifts
+    out_qint: list[QInterval]
+        Output quantization intervals
+    out_lat: list[float]
+        Output latencies
+    out_shift: list[int]
+        Output shifts
+    out_neg: list[bool]
+        Output signs
+    shape: tuple[int, int]
+        The shape of the corresponding kernel matrix.
+    """
+    solutions: tuple[CombLogic, ...]
+    def __call__(self, inp: list | np.ndarray | tuple, quantize=False, debug=False):
+        out = np.asarray(inp)
+        for sol in self.solutions:
+            out = sol(out, quantize=quantize, debug=debug)
+        return out
+    @property
+    def kernel(self):
+        return reduce(lambda x, y: x @ y, [sol.kernel for sol in self.solutions])
+    @property
+    def cost(self):
+        return sum(sol.cost for sol in self.solutions)
+    @property
+    def latency(self):
+        return self.solutions[-1].latency
+    @property
+    def inp_qint(self):
+        return self.solutions[0].inp_qint
+    @property
+    def inp_latency(self):
+        return self.solutions[0].inp_latency
+    @property
+    def out_qint(self):
+        return self.solutions[-1].out_qint
+    @property
+    def out_latencies(self):
+        return self.solutions[-1].out_latency
+    @property
+    def shape(self):
+        return self.solutions[0].shape[0], self.solutions[-1].shape[1]
+    @property
+    def inp_shifts(self):
+        return self.solutions[0].inp_shifts
+    @property
+    def out_shift(self):
+        return self.solutions[-1].out_shifts
+    @property
+    def out_neg(self):
+        return self.solutions[-1].out_negs
+    def __repr__(self) -> str:
+        n_ins = [sol.shape[0] for sol in self.solutions] + [self.shape[1]]
+        shape_str = ' -> '.join(map(str, n_ins))
+        _cost = self.cost
+        lat_min, lat_max = self.latency
+        return f'CascatedSolution([{shape_str}], cost={_cost}, latency={lat_min}-{lat_max})'
+    def save(self, path: str | Path):
+        """Save the solution to a file."""
+        with open(path, 'w') as f:
+            json.dump(self, f, cls=JSONEncoder)
+    @classmethod
+    def deserialize(cls, data: dict):
+        """Load the solution from a file."""
+        return cls(solutions=tuple(CombLogic.deserialize(sol) for sol in data[0]))
+    @classmethod
+    def load(cls, path: str):
+        """Load the solution from a file."""
+        with open(path) as f:
+            data = json.load(f)
+        return cls.deserialize(data)
+    @property
+    def reg_bits(self):
+        """The number of bits used for the register in the solution."""
+        bits = sum(map(sum, (_minimal_kif(qint) for qint in self.inp_qint)))
+        for _sol in self.solutions:
+            kifs = [_minimal_kif(qint) for qint in _sol.out_qint]
+            _bits = sum(map(sum, kifs))
+            bits += _bits
+        return bits