da4ml 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of da4ml has been flagged as potentially problematic.

da4ml/trace/fixed_variable_array.py CHANGED
@@ -86,6 +86,21 @@ class FixedVariableArray:
             assert bind.arguments.get('out', None) is None, 'Output argument is not supported'
             return einsum(eq, *operands)
 
+        if func in (np.dot, np.matmul):
+            assert len(args) in (2, 3), 'Dot function requires exactly two or three arguments'
+
+            assert len(args) == 2
+            a, b = args
+            if not isinstance(a, FixedVariableArray):
+                a = np.array(a)
+            if not isinstance(b, FixedVariableArray):
+                b = np.array(b)
+            if a.shape[-1] == b.shape[0]:
+                return a @ b
+
+            assert a.size == 1 or b.size == 1, f'Error in dot product: {a.shape} @ {b.shape}'
+            return a * b
+
         args, kwargs = to_raw_arr(args), to_raw_arr(kwargs)
         return FixedVariableArray(
             func(*args, **kwargs),
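The hunk above routes `np.dot` and `np.matmul` through the array-function protocol: matching inner dimensions dispatch to `@`, while a size-1 operand falls back to elementwise multiplication. A minimal sketch of the resulting behavior, assuming a 0.3.3 install; the shapes and k/i/f values are illustrative, while `from_kif` and `HWConfig(1, -1, -1)` come from the package README:

```python
import numpy as np

from da4ml.trace import FixedVariableArray, HWConfig

# Illustrative fixed-point spec: keep_negative, integer bits (excl. sign), fractional bits
k, i, f = np.ones((3, 4)), np.full((3, 4), 3), np.full((3, 4), 4)
x = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1))

w = np.arange(8).reshape(4, 2) / 4.0  # constant kernel
y = np.matmul(x, w)                   # inner dims match -> dispatches to x @ w
z = np.dot(x, np.array([2.0]))        # size-1 operand -> elementwise multiply fallback
```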
@@ -149,13 +164,33 @@ class FixedVariableArray:
         latency: NDArray[np.floating] | float = 0.0,
         solver_options: dict[str, Any] | None = None,
     ):
+        mask = k + i + f <= 0
+        k = np.where(mask, 0, k)
+        i = np.where(mask, 0, i)
+        f = np.where(mask, 0, f)
         step = 2.0**-f
         _high = 2.0**i
         high, low = _high - step, -_high * k
         return cls.from_lhs(low, high, step, hwconf, latency, solver_options)
 
     def __matmul__(self, other):
-        assert isinstance(other, np.ndarray)
+        if isinstance(other, FixedVariableArray):
+            other = other._vars
+        if not isinstance(other, np.ndarray):
+            other = np.array(other)
+        if any(isinstance(x, FixedVariable) for x in other.ravel()):
+            mat0, mat1 = self._vars, other
+            shape = mat0.shape[:-1] + mat1.shape[1:]
+            mat0, mat1 = mat0.reshape((-1, mat0.shape[-1])), mat1.reshape((mat1.shape[0], -1))
+            _shape = (mat0.shape[0], mat1.shape[1])
+            _vars = np.empty(_shape, dtype=object)
+            for i in range(mat0.shape[0]):
+                for j in range(mat1.shape[1]):
+                    vec0 = mat0[i]
+                    vec1 = mat1[:, j]
+                    _vars[i, j] = reduce(lambda x, y: x + y, vec0 * vec1)
+            return FixedVariableArray(_vars.reshape(shape), self.solver_options)
+
         kwargs = (self.solver_options or {}).copy()
         shape0, shape1 = self.shape, other.shape
         assert shape0[-1] == shape1[0], f'Matrix shapes do not match: {shape0} @ {shape1}'
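With this branch, `@` between two symbolic arrays expands into elementwise variable products accumulated through `reduce(lambda x, y: x + y, vec0 * vec1)`, which the tracer lowers to the new 'vmul' opcode (see the tracer.py hunks below). A hedged sketch with illustrative shapes; `FixedVariableArrayInput` is the input class whose signature is reformatted later in this diff:

```python
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput

a = FixedVariableArrayInput((3, 4))
b = FixedVariableArrayInput((4, 2))
c = a @ b  # 0.3.1 asserted `other` to be an ndarray; now a (3, 2) symbolic result
```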
@@ -180,9 +215,9 @@ class FixedVariableArray:
 
     def __rmatmul__(self, other):
         mat1 = np.moveaxis(other, -1, 0)
-        mat0 = np.moveaxis(self._vars, 0, -1)
+        mat0 = np.moveaxis(self, 0, -1)  # type: ignore
         ndim0, ndim1 = mat0.ndim, mat1.ndim
-        r = FixedVariableArray(mat0, self.solver_options) @ mat1
+        r = mat0 @ mat1
 
         _axes = tuple(range(0, ndim0 + ndim1 - 2))
         axes = _axes[ndim0 - 1 :] + _axes[: ndim0 - 1]
@@ -213,6 +248,8 @@ class FixedVariableArray:
         return FixedVariableArray(self._vars - other, self.solver_options)
 
     def __mul__(self, other):
+        if isinstance(other, FixedVariableArray):
+            return FixedVariableArray(self._vars * other._vars, self.solver_options)
         return FixedVariableArray(self._vars * other, self.solver_options)
 
     def __truediv__(self, other):
@@ -230,6 +267,11 @@ class FixedVariableArray:
         max_lat = max(v.latency for v in self._vars.ravel())
         return f'FixedVariableArray(shape={shape}, hwconf={hwconf_str}, latency={max_lat})'
 
+    def __pow__(self, power: int | float):
+        _power = int(power)
+        assert _power == power, 'Power must be an integer'
+        return FixedVariableArray(self._vars**_power, self.solver_options)
+
     def relu(self, i: NDArray[np.integer] | None = None, f: NDArray[np.integer] | None = None, round_mode: str = 'TRN'):
         shape = self._vars.shape
         i = np.broadcast_to(i, shape) if i is not None else np.full(shape, None)
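Together with the `__mul__` branch above, products of symbolic values are now representable, and `__pow__` builds on that: the exponent must equal its integer cast, and the power is applied elementwise on the underlying object array. A sketch under the assumption that the scalar `FixedVariable` supports `**` with an integer exponent (implied by the enlarged fixed_variable.py in the RECORD below):

```python
x = FixedVariableArrayInput(8)
y = x**2     # elementwise square of the symbolic variables (assumed FixedVariable.__pow__)
y3 = x**3.0  # accepted: 3.0 equals int(3.0)
x**0.5       # AssertionError: 'Power must be an integer'
```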
@@ -241,9 +283,9 @@ class FixedVariableArray:
 
     def quantize(
         self,
-        k: NDArray[np.integer] | None = None,
-        i: NDArray[np.integer] | None = None,
-        f: NDArray[np.integer] | None = None,
+        k: NDArray[np.integer] | np.integer | int | None = None,
+        i: NDArray[np.integer] | np.integer | int | None = None,
+        f: NDArray[np.integer] | np.integer | int | None = None,
         overflow_mode: str = 'WRAP',
         round_mode: str = 'TRN',
     ):
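Since `k`, `i`, and `f` now also admit plain Python or numpy scalars, per-tensor quantization no longer requires pre-broadcast arrays. A hedged usage sketch (modes per the README: WRAP overflow with TRN or RND rounding):

```python
x = FixedVariableArrayInput((4, 4))
xq = x.quantize(k=1, i=3, f=4)                    # scalars instead of shape-(4, 4) arrays
xr = x.quantize(k=1, i=3, f=4, round_mode='RND')  # round-to-nearest variant
```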
@@ -276,6 +318,10 @@ class FixedVariableArray:
     def size(self):
         return self._vars.size
 
+    @property
+    def ndim(self):
+        return self._vars.ndim
+
     @property
     def kif(self):
         shape = self._vars.shape
@@ -284,7 +330,13 @@ class FixedVariableArray:
 
 
 class FixedVariableArrayInput(FixedVariableArray):
-    def __init__(self, shape: tuple[int, ...] | int, hwconf: HWConfig, solver_options: dict[str, Any] | None = None, latency=0.0):
+    def __init__(
+        self,
+        shape: tuple[int, ...] | int,
+        hwconf: HWConfig = HWConfig(1, -1, -1),
+        solver_options: dict[str, Any] | None = None,
+        latency=0.0,
+    ):
         _vars = np.empty(shape, dtype=object)
         _vars_f = _vars.ravel()
         for i in range(_vars.size):
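Besides the multi-line reformat, the signature change gives `hwconf` a default of `HWConfig(1, -1, -1)`, the same configuration the README examples pass explicitly, so a symbolic input can now be declared from a shape alone:

```python
from da4ml.trace import HWConfig
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput

inp = FixedVariableArrayInput((16,))                        # uses the new default hwconf
same = FixedVariableArrayInput((16,), HWConfig(1, -1, -1))  # equivalent explicit form
```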
da4ml/trace/ops/__init__.py CHANGED
@@ -35,9 +35,9 @@ def relu(x: T, i: NDArray[np.integer] | None = None, f: NDArray[np.integer] | No
 
 def quantize(
     x: T,
-    k: NDArray[np.integer],
-    i: NDArray[np.integer],
-    f: NDArray[np.integer],
+    k: NDArray[np.integer] | np.integer | int,
+    i: NDArray[np.integer] | np.integer | int,
+    f: NDArray[np.integer] | np.integer | int,
     overflow_mode: str = 'WRAP',
     round_mode: str = 'TRN',
 ) -> T:
@@ -47,7 +47,7 @@ def quantize(
         return x.quantize(k=k, i=i, f=f, overflow_mode=overflow_mode, round_mode=round_mode)
     else:
         x = x.copy()
-        if overflow_mode in ('SAT', 'SAT_SM'):
+        if overflow_mode in ('SAT', 'SAT_SYM'):
            step = 2.0**-f
            _high = 2.0**i
            high = _high - step
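The string fix matters for the numeric path: 'SAT_SYM' was previously misspelled as 'SAT_SM' in the membership test, so symmetric saturation silently fell through to wrapping for plain arrays. A sketch of the now-working call, with illustrative values:

```python
import numpy as np

from da4ml.trace.ops import quantize

arr = np.array([3.9, -2.7, 1.2])
q = quantize(arr, k=1, i=2, f=1, overflow_mode='SAT_SYM')  # saturates instead of wrapping
```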
da4ml/trace/ops/einsum_utils.py CHANGED
@@ -271,6 +271,10 @@ def _einsum(fn: str, input0, input1) -> np.ndarray:
     return _exec_einsum(recipe, input0, input1)
 
 
+@overload
+def einsum(fn: str, input0: 'FixedVariableArray', input1: 'FixedVariableArray') -> 'FixedVariableArray': ...
+
+
 @overload
 def einsum(fn: str, input0: 'FixedVariableArray', input1: NDArray[np.integer | np.floating]) -> 'FixedVariableArray': ...
 
@@ -290,10 +294,9 @@ def einsum(fn: str, input0, input1):
 
     fg0 = isinstance(input0, FixedVariableArray)
     fg1 = isinstance(input1, FixedVariableArray)
-    if fg0 and fg1:
-        raise ValueError('Einsum does not support two FixedVariableArray inputs')
 
     r = _einsum(fn, input0, input1)
+
     if fg0:
         return FixedVariableArray(r, input0.solver_options)
     elif fg1:
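With the extra overload above and this guard removed, both einsum operands may now be symbolic. A short sketch (equation and shapes illustrative):

```python
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput
from da4ml.trace.ops import einsum

a = FixedVariableArrayInput((4, 3))
b = FixedVariableArrayInput((3, 5))
c = einsum('ij,jk->ik', a, b)  # 0.3.1 raised ValueError for two FixedVariableArray inputs
```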
da4ml/trace/ops/reduce_utils.py CHANGED
@@ -99,5 +99,7 @@ def reduce(operator: Callable[[T, T], T], x: TA, axis: int | Sequence[int] | Non
     r = _arr.reshape(target_shape)  # type: ignore
 
     if isinstance(x, FixedVariableArray):
-        return FixedVariableArray(r, solver_config)
-    return r
+        r = FixedVariableArray(r, solver_config)
+        if r.size == 1 and not keepdims:
+            return r.ravel()[0]  # type: ignore
+    return r if r.size > 1 or keepdims else r.ravel()[0]  # type: ignore
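A behavioral note on this change: a full reduction over a symbolic array now yields the single `FixedVariable` itself rather than a size-1 `FixedVariableArray` when `keepdims` is falsy. A hedged sketch, assuming `axis=None` reduces over all axes (module path taken from the RECORD below):

```python
from operator import add

from da4ml.trace.fixed_variable_array import FixedVariableArrayInput
from da4ml.trace.ops.reduce_utils import reduce

v = FixedVariableArrayInput(8)
s = reduce(add, v, axis=None)  # now a scalar FixedVariable, not a (1,)-shaped array
```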
da4ml/trace/pipeline.py CHANGED
@@ -38,7 +38,7 @@ def _get_new_idx(
     out_idxd: dict[int, list[int]],
     ops: list[Op],
     stage: int,
-    latency_cutoff: int,
+    latency_cutoff: float,
 ):
     if idx < 0:
         return idx
@@ -60,7 +60,7 @@
     return p0_idx
 
 
-def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True) -> CascadedSolution:
+def to_pipeline(sol: Solution, latency_cutoff: float, retiming=True, verbose=True) -> CascadedSolution:
     """Split the record into multiple stages based on the latency of the operations.
     Only useful for HDL generation.
 
@@ -68,7 +68,7 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
     ----------
     sol : Solution
         The solution to be split into multiple stages.
-    latency_cutoff : int
+    latency_cutoff : float
         The latency cutoff for splitting the operations.
     retiming : bool
         Whether to retime the solution after splitting. Default is True.
@@ -126,10 +126,10 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
             locator.append({stage: len(opd[stage]) - 1})
     sols = []
     max_stage = max(opd.keys())
+    n_in = sol.shape[0]
     for i, stage in enumerate(opd.keys()):
         _ops = opd[stage]
         _out_idx = out_idxd[stage]
-        n_in = sum(op.opcode == -1 for op in _ops)
         n_out = len(_out_idx)
 
         if i == max_stage:
@@ -150,6 +150,8 @@ def to_pipeline(sol: Solution, latency_cutoff: int, retiming=True, verbose=True)
             adder_size=sol.adder_size,
         )
         sols.append(_sol)
+
+        n_in = n_out
     csol = CascadedSolution(tuple(sols))
 
     if retiming:
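Two effects of this pair of hunks: fractional latency cutoffs now type-check, and each stage's input width is carried over from the previous stage's output width instead of being recounted from opcodes. A hedged end-to-end sketch built on the README's tracing flow:

```python
from da4ml.trace import comb_trace
from da4ml.trace.fixed_variable_array import FixedVariableArrayInput
from da4ml.trace.pipeline import to_pipeline

inp = FixedVariableArrayInput(4)
out = inp + inp                              # any traced computation
sol = comb_trace(inp, out)                   # a Solution, as in the README example
csol = to_pipeline(sol, latency_cutoff=2.5)  # float cutoff; returns a CascadedSolution
```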
da4ml/trace/tracer.py CHANGED
@@ -17,8 +17,7 @@ def _recursive_gather(v: FixedVariable, gathered: dict[UUID, FixedVariable]):
         return
     assert v._from is not None
     for _v in v._from:
-        if _v.id not in gathered:
-            _recursive_gather(_v, gathered)
+        _recursive_gather(_v, gathered)
     gathered[v.id] = v
 
 
@@ -26,13 +25,24 @@ def gather_variables(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVar
     gathered = {v.id: v for v in inputs}
     for o in outputs:
         _recursive_gather(o, gathered)
-
     variables = list(gathered.values())
 
     N = len(variables)
     _index = sorted(list(range(N)), key=lambda i: variables[i].latency * N + i)
     variables = [variables[i] for i in _index]
-    index = {variables[i].id: i for i in range(N)}
+
+    # Remove variables with 0 refcount
+    refcount = {v.id: 0 for v in variables}
+    for v in variables:
+        if v in inputs:
+            continue
+        for _v in v._from:
+            refcount[_v.id] += 1
+    for v in outputs:
+        refcount[v.id] += 1
+
+    variables = [v for v in variables if refcount[v.id] > 0]
+    index = {variables[i].id: i for i in range(len(variables))}
 
     return variables, index
 
@@ -44,7 +54,7 @@ def _comb_trace(inputs: Sequence[FixedVariable], outputs: Sequence[FixedVariable
     for i, v in enumerate(variables):
         if v.id in inp_uuids and v.opr != 'const':
             id0 = inp_uuids[v.id]
-            ops.append(Op(id0, -1, -1, 0, v.unscaled.qint, v.latency, v.cost))
+            ops.append(Op(id0, -1, -1, 0, v.unscaled.qint, v.latency, 0.0))
             continue
         if v.opr == 'new':
             raise NotImplementedError('Operation "new" is only expected in the input list')
@@ -56,7 +66,7 @@
                 sub = int(f1 < 0)
                 data = int(log2(abs(f1 / f0)))
                 assert id0 < i and id1 < i, f'{id0} {id1} {i} {v.id}'
-                ops.append(Op(id0, id1, sub, data, v.unscaled.qint, v.latency, v.cost))
+                op = Op(id0, id1, sub, data, v.unscaled.qint, v.latency, v.cost)
             case 'cadd':
                 v0 = v._from[0]
                 f0 = v0._factor
@@ -65,19 +75,19 @@
                 qint = v.unscaled.qint
                 data = int(v._data / Decimal(qint.step))
                 assert id0 < i, f'{id0} {i} {v.id}'
-                ops.append(Op(id0, -1, 4, data, qint, v.latency, v.cost))
+                op = Op(id0, -1, 4, data, qint, v.latency, v.cost)
             case 'wrap':
                 v0 = v._from[0]
                 id0 = index[v0.id]
                 assert id0 < i, f'{id0} {i} {v.id}'
                 opcode = -3 if v._from[0]._factor < 0 else 3
-                ops.append(Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost))
+                op = Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost)
             case 'relu':
                 v0 = v._from[0]
                 id0 = index[v0.id]
                 assert id0 < i, f'{id0} {i} {v.id}'
                 opcode = -2 if v._from[0]._factor < 0 else 2
-                ops.append(Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost))
+                op = Op(id0, -1, opcode, 0, v.unscaled.qint, v.latency, v.cost)
             case 'const':
                 qint = v.unscaled.qint
                 assert qint.min == qint.max, f'const {v.id} {qint.min} {qint.max}'
@@ -85,7 +95,7 @@
                 step = 2.0**-f
                 qint = QInterval(qint.min, qint.min, step)
                 data = qint.min / step
-                ops.append(Op(-1, -1, 5, int(data), qint, v.latency, v.cost))
+                op = Op(-1, -1, 5, int(data), qint, v.latency, v.cost)
             case 'msb_mux':
                 qint = v.unscaled.qint
                 key, in0, in1 = v._from
@@ -97,10 +107,14 @@
                 assert idk < i and id0 < i and id1 < i
                 assert key._factor > 0, f'Cannot mux on v{key.id} with negative factor {key._factor}'
                 op = Op(id0, id1, opcode, data, qint, v.latency, v.cost)
-                ops.append(op)
-
+            case 'vmul':
+                v0, v1 = v._from
+                id0, id1 = index[v0.id], index[v1.id]
+                op = Op(id0, id1, 7, 0, v.unscaled.qint, v.latency, v.cost)
             case _:
                 raise NotImplementedError(f'Operation "{v.opr}" is not supported in tracing')
+
+        ops.append(op)
     out_index = [index[v.id] for v in outputs]
     return ops, out_index
 
@@ -147,6 +161,6 @@ def comb_trace(inputs, outputs):
     for i in range(len(ops)):
         if ref_count[i] == 0:
             op = ops[i]
-            sol.ops[i] = Op(-1, -1, op[2], 0, QInterval(0, 0, 1), op[5], op[6])
+            sol.ops[i] = Op(-1, -1, 5, 0, QInterval(0, 0, 1), op[5], 0.0)
 
     return sol
da4ml-0.3.3.dist-info/METADATA ADDED
@@ -0,0 +1,66 @@
+Metadata-Version: 2.4
+Name: da4ml
+Version: 0.3.3
+Summary: Digital Arithmetic for Machine Learning
+Author-email: Chang Sun <chsun@cern.ch>
+License: GNU Lesser General Public License v3 (LGPLv3)
+Project-URL: repository, https://github.com/calad0i/da4ml
+Keywords: CMVM,distributed arithmetic,hls4ml,MCM,subexpression elimination
+Classifier: Development Status :: 4 - Beta
+Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: llvmlite>=0.44
+Requires-Dist: numba>=0.61
+Provides-Extra: docs
+Requires-Dist: hgq2; extra == "docs"
+Requires-Dist: myst-parser; extra == "docs"
+Requires-Dist: pyparsing; extra == "docs"
+Requires-Dist: sphinx; extra == "docs"
+Requires-Dist: sphinx-rtd-theme; extra == "docs"
+Dynamic: license-file
+
+# da4ml: Distributed Arithmetic for Machine Learning
+
+[![LGPLv3](https://img.shields.io/badge/License-LGPLv3-blue.svg)](https://www.gnu.org/licenses/lgpl-3.0)
+[![Documentation](https://github.com/calad0i/da4ml/actions/workflows/sphinx-build.yml/badge.svg)](https://calad0i.github.io/da4ml/)
+[![PyPI version](https://badge.fury.io/py/da4ml.svg)](https://badge.fury.io/py/da4ml)
+[![ArXiv](https://img.shields.io/badge/arXiv-2507.04535-b31b1b.svg)](https://arxiv.org/abs/2507.04535)
+
+da4ml is a library for implementing distributed arithmetic (DA) based algorithms for ultra-low latency machine learning (ML) applications on FPGAs. It has two major components:
+- A fast and performant optimizer for constant-matrix-vector multiplications (CMVM) that implements them as efficient adder trees. Common subexpression elimination (CSE) with graph-based pre-optimization is performed to reduce the firmware footprint and improve performance.
+- A low-level symbolic tracing framework for generating combinational or fully pipelined logic in HDL or HLS code. For fully pipelined networks, da4ml can generate the firmware for the whole network standalone. Alternatively, da4ml can be used as a plugin in hls4ml to optimize the CMVM operations in the network.
+
+
+Key Features
+------------
+
+- **Optimized Algorithms**: Compared to hls4ml's latency strategy, da4ml's CMVM implementation uses no DSPs and consumes up to 50% fewer LUTs.
+- **Fast code generation**: da4ml can generate HDL for a fully pipelined network in seconds. For the same models, high-level synthesis tools like Vivado/Vitis HLS can take up to days to generate the HDL code.
+- **Low-level symbolic tracing**: As long as an operation can be expressed as a combination of the supported low-level operations, adding it is straightforward: "replay" the operation on the symbolic tensor provided. In most cases, adding support for a new operation/layer takes just a few lines of numpy-flavored code.
+- **Automatic model conversion**: da4ml can automatically convert models trained in [HGQ2](https://github.com/calad0i/hgq2).
+- **Bit-accurate Simulation**: All operations in da4ml are bit-accurate, meaning the generated HDL code will produce the same output as the original model. da4ml's computation is converted to a RISC-like, instruction-set-level intermediate representation, the distributed arithmetic instruction set (DAIS), which can be easily simulated in multiple ways.
+- **hls4ml integration**: da4ml can be used as a plugin in hls4ml to optimize the CMVM operations in the network by setting `strategy='distributed_arithmetic'` for the Dense, EinsumDense, or Conv1/2D layers.
+
+Installation
+------------
+
+```bash
+pip install da4ml
+```
+
+Getting Started
+---------------
+
+See the [Getting Started](https://calad0i.github.io/da4ml/getting_started.html) guide for a quick introduction to using da4ml.
da4ml-0.3.1.dist-info/RECORD → da4ml-0.3.3.dist-info/RECORD RENAMED
@@ -1,8 +1,8 @@
 da4ml/__init__.py,sha256=IETRRvzsJvPMLu1kzzi8UN5FYaM5MhNaXH2A_ZKr2_w,469
-da4ml/_version.py,sha256=lOWWIGJeBi0KkFopWU_n3GH71C1PsaZ-ZYDfxFkne6c,511
+da4ml/_version.py,sha256=lemL_4Kl75FgrO6lVuFrrtw6-Dcf9wtXBalKkXuzkO4,704
 da4ml/cmvm/__init__.py,sha256=4Tbt913k9zP0w8R1p6Oss06v5jrManbUhskyHl6e-U0,154
 da4ml/cmvm/api.py,sha256=JpecMt6g8zutGh_uWT61_0iX8TuXct7-jq7N7HMIsgA,9626
-da4ml/cmvm/types.py,sha256=hdthYdP5muIQ-9qFE0CjObGT7lCxB1-udXU16LxtuBI,20959
+da4ml/cmvm/types.py,sha256=O8BuBZ2SyucxoXt_KbulAuHNgim7Ls3M6Ovw8prLgXM,21340
 da4ml/cmvm/core/__init__.py,sha256=bp2CXI4EOVOQSho1qwfusNs0RliZRt2dV0hZ33W_Kjo,7703
 da4ml/cmvm/core/indexers.py,sha256=QjXgvExS-B2abHTJPDG4NufMdMEflo1i6cUhFOgJpH4,2945
 da4ml/cmvm/core/state_opr.py,sha256=wLqO8qVuM2-qCE5LDeYJDNkUruIPHy63obsv4-x-aR8,8661
@@ -11,11 +11,11 @@ da4ml/cmvm/util/bit_decompose.py,sha256=SUco70HRYf4r1JU6BXwcgabDrhm_yAmucae5FC67
 da4ml/cmvm/util/mat_decompose.py,sha256=eSJNlXwx_jxgqt5vLJrSLQaeq2ZXu8j9mC4d-eq883M,4094
 da4ml/codegen/__init__.py,sha256=Chdh3oO_vLR4saLbT9VxBPz_0wlEzxJldFSZaVUJo7U,331
 da4ml/codegen/cpp/__init__.py,sha256=SIePoi_T4iJph50OQUosAnaVuLCckukYjLxp91Y8xQs,134
-da4ml/codegen/cpp/cpp_codegen.py,sha256=6lBF1I-xXdIABEWF60owBmQiISuI6mrITCqLqhsEHrQ,6033
+da4ml/codegen/cpp/cpp_codegen.py,sha256=I3YcxK524_oJ7jebxOlRGuYbN2uCY5mpKACoQShqZxs,6153
 da4ml/codegen/cpp/hls_model.py,sha256=J5lnB8sAvMy0Bo5MSJOpgyUm1tzEJqBxgPTlOd38Gbg,8978
-da4ml/codegen/cpp/source/binder_util.hh,sha256=pBVmhXIDvdCr8n2wwYehc3Fpp60sWYrrZaDoP3x9JZE,1880
+da4ml/codegen/cpp/source/binder_util.hh,sha256=ClECVxcEynE_9i4jWCV4y1dnadG3wFqLZfjxg4qHFQQ,1752
 da4ml/codegen/cpp/source/build_binder.mk,sha256=RLu4TP28aJsveyMOHxuDRGEJVoIPMo9T8WyPtqnmtbQ,584
-da4ml/codegen/cpp/source/vitis_bitshift.hh,sha256=yFpYCVJ8gof-EzPjkIWWZYmdFh_wk133Pxzs7f61IQo,774
+da4ml/codegen/cpp/source/vitis_bitshift.hh,sha256=u8wjT_cRn7bXcbC5pH3-rS76ekRbwv-VWAAdaP52-dw,765
 da4ml/codegen/cpp/source/ap_types/ap_binary.h,sha256=yOcafu2IofstDqxn0wDq8vY3JIwZQ9H5z6IY1dEqMr0,2764
 da4ml/codegen/cpp/source/ap_types/ap_common.h,sha256=1hJY9uvKOdwRSSll5uehUISZR4tsSsQ1z4PNRUc44KU,10180
 da4ml/codegen/cpp/source/ap_types/ap_decl.h,sha256=z1HsH-2RSvSoofTZR7RHeqIfAnEYVuHcIu_ute9gjEg,6473
@@ -33,32 +33,34 @@ da4ml/codegen/cpp/source/ap_types/hls_stream.h,sha256=NTkVfbE48c6XnMIfR9WzJbDwUn
 da4ml/codegen/cpp/source/ap_types/etc/ap_private.h,sha256=TDdxGIX0r3D6Ql8KeXoceRmHhdlwFA3Akr3-vvMVAtk,261465
 da4ml/codegen/cpp/source/ap_types/utils/x_hls_utils.h,sha256=x24cf1HyZKv0J8YQIoUvYE3uw6SNL7vWetRGIiFm2Jw,2227
 da4ml/codegen/verilog/__init__.py,sha256=rXmW2V9sDp2RYMDAWlhj_gfMXH3G5lPNmLrFtsJjn_A,298
-da4ml/codegen/verilog/comb.py,sha256=CmCwiddeiT4TCZV088lF2ENlAXx3vjZKszTz1sYXEao,7614
+da4ml/codegen/verilog/comb.py,sha256=AnrfJxJXe3hytXiX00VGbdW91AAJDF-dLdsSSWBivdU,7961
 da4ml/codegen/verilog/io_wrapper.py,sha256=SSs-ZRhBVLR6tpFso8GNGk-FH6JDe-p7LPvVPjTspxo,5002
 da4ml/codegen/verilog/pipeline.py,sha256=YsPRTLp04Aofg33QMw6_ga3fNX9LeCD7Pq2PnERLWOg,2377
-da4ml/codegen/verilog/verilog_model.py,sha256=3ZFaHqx1ONX3uxDKsbzLPxy3D7dehveRmdBfBiiS64o,12299
-da4ml/codegen/verilog/source/binder_util.hh,sha256=Dn9ysUdonw0HR8bxom8YfQF7vc1LEvT_B1V_o8Gw1rY,2503
+da4ml/codegen/verilog/verilog_model.py,sha256=2uyrpQN_f1cdF5fz0fBR5nh6idHlzhh_JneLkJAruQs,12172
+da4ml/codegen/verilog/source/binder_util.hh,sha256=2sab9M0vYBsaimzJ8tWJ9LsxYKMe3xTqdFSGO7YRPbk,2521
 da4ml/codegen/verilog/source/build_binder.mk,sha256=rQbI98itE_b1wIQ_0uCXfBzNmGK2XT4vWmRyCJNnPKk,960
 da4ml/codegen/verilog/source/build_prj.tcl,sha256=JA-zLl7fd2PV-BFaX22-MTex04QTi0urWUXNAEUDTy0,3003
-da4ml/codegen/verilog/source/ioutil.hh,sha256=1o1-oIyQyYc9CU91bBxuitVzzcrNT8p4MTarFKiJoG4,3967
+da4ml/codegen/verilog/source/ioutil.hh,sha256=QXiHbOfkprOL6b-gBQGwcEOQ39uO-bRxKxwObluiK44,3967
+da4ml/codegen/verilog/source/multiplier.v,sha256=MfgRYi7jYPp4W94KLKWpc2MPu2Dg9CDiQ3lJizSIlIQ,1122
 da4ml/codegen/verilog/source/mux.v,sha256=1PMSQKGR_Cku1EQnePBVCuX6we_dqYBXW54WBEURvs0,1928
 da4ml/codegen/verilog/source/negative.v,sha256=YphTCLnYslktsnCPq1xjbYgIFavani5NBbqs20uwhBI,688
 da4ml/codegen/verilog/source/shift_adder.v,sha256=qrpXBX9bhHI-o75v5zshOfq0giEATvbeGgTir20_S3Q,1915
 da4ml/codegen/verilog/source/template.xdc,sha256=GlSRy8tw_orohSuUwUSNEYJLLkAAHttGTfLTcQqRQDg,1262
-da4ml/converter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-da4ml/converter/hgq2/parser.py,sha256=bAtnEXQxRKU9a1HFJWTy-e_HDzZY_wXOBVdyYG3ndsM,3826
-da4ml/converter/hgq2/replica.py,sha256=VzasMOasU73ZQr0xH2e8uOeKVFX_Is0n9aZFscGKcik,13864
+da4ml/converter/__init__.py,sha256=x7J2PEXYZsVWffRAkucLxbwzzU404eaijMdLwdhBxtY,57
+da4ml/converter/hgq2/__init__.py,sha256=-gnT_7zXY-KQtPLxsqngwDKZ2TUIynn996pUjjB03B8,59
+da4ml/converter/hgq2/parser.py,sha256=Yc5V-B_aEslqIXXJihRi3GMjF9vMkmUQ2_yHMGHMPVo,5573
+da4ml/converter/hgq2/replica.py,sha256=aKi6BF2x4s3VUF1Q-__GE4-is9eSC3H8TGFDT05vTWc,16292
 da4ml/trace/__init__.py,sha256=dv-rti3t8iE0RqeThfOb40mAg8FZB2WkkGQq3enJft0,282
-da4ml/trace/fixed_variable.py,sha256=6dfMHBN1NfqYIbPZ79GCPCXj2JFQUKTyDZu6xDaG3rg,17082
-da4ml/trace/fixed_variable_array.py,sha256=A0ApTvZxpkr7kHrUQkyhrGJuuPe4kDgLFyD_1CW7lBk,10985
-da4ml/trace/pipeline.py,sha256=_R2uqWgnpuQ4tD7VKz2eu8CF9Air2RtYH2o03Vfg0Mk,5353
-da4ml/trace/tracer.py,sha256=NqPEH9hyVlGQOf9_kJL3A7SujCcxkT-z28bk0Ael5jE,5664
-da4ml/trace/ops/__init__.py,sha256=I4VqB43lVkFlLtkoWxiSDHBFGvxKwutNbAJw5aLVeAI,2108
+da4ml/trace/fixed_variable.py,sha256=7vaXFZToCVzPtUZcHv4aoqpqJp46SHUzSWTQijVT0os,21101
+da4ml/trace/fixed_variable_array.py,sha256=mJj9aU-jLCPVkFXrTbcRQndtUKEuhVwiFUGVSGX7PHE,12975
+da4ml/trace/pipeline.py,sha256=AVeO9BNpQlo_WO6S1nQl7RxiHs5VFRR10tWMg_36C2o,5354
+da4ml/trace/tracer.py,sha256=xnaVO4oTWwasfiEBqqeY9o60Lek3eX65IIbvB7JtVKQ,6099
+da4ml/trace/ops/__init__.py,sha256=fz5Cg7ZQqPkZlUj4bIOKY6aaoA1fX_G22TeA8I1n4qY,2166
 da4ml/trace/ops/conv_utils.py,sha256=Yn73t4F6Tcs1hBwK08L1DPOin2HYVcng4PSkU4vuZFo,8245
-da4ml/trace/ops/einsum_utils.py,sha256=MoWvOfvtVjXGwqEhXEzZ3uGrgSmLTHngV8I1eLyANGE,11433
-da4ml/trace/ops/reduce_utils.py,sha256=8gohGQRVr8Bn5rfyrGsnE8EDxUXAObv521qu4mJrX9I,3348
-da4ml-0.3.1.dist-info/licenses/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
-da4ml-0.3.1.dist-info/METADATA,sha256=3H1yt5sKqrIncAGok6NqE27O66_yD7hPUN6jFmCdMqQ,4569
-da4ml-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-da4ml-0.3.1.dist-info/top_level.txt,sha256=N0tnKVwRqFiffFdeAzCgFq71hUNySh5-ITbNd6-R58Q,6
-da4ml-0.3.1.dist-info/RECORD,,
+da4ml/trace/ops/einsum_utils.py,sha256=ODofbvR98FwKBTDZsJ0ObbMjU9_GjPu5AbGuWX6sdCY,11453
+da4ml/trace/ops/reduce_utils.py,sha256=vQjEUUbvnW8inAYJWHDzgy-PbgwIdHlH-uzPzSEvrSc,3494
+da4ml-0.3.3.dist-info/licenses/LICENSE,sha256=46mU2C5kSwOnkqkw9XQAJlhBL2JAf1_uCD8lVcXyMRg,7652
+da4ml-0.3.3.dist-info/METADATA,sha256=C3NAvObpQ5xNOmQQ-cE77AJMFevKJ0gCCO-BrlQpAeA,4055
+da4ml-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+da4ml-0.3.3.dist-info/top_level.txt,sha256=N0tnKVwRqFiffFdeAzCgFq71hUNySh5-ITbNd6-R58Q,6
+da4ml-0.3.3.dist-info/RECORD,,
da4ml-0.3.1.dist-info/METADATA DELETED
@@ -1,107 +0,0 @@
-Metadata-Version: 2.4
-Name: da4ml
-Version: 0.3.1
-Summary: Digital Arithmetic for Machine Learning
-Author-email: Chang Sun <chsun@cern.ch>
-License: GNU Lesser General Public License v3 (LGPLv3)
-Project-URL: repository, https://github.com/calad0i/da4ml
-Keywords: CMVM,distributed arithmetic,hls4ml,MCM,subexpression elimination
-Classifier: Development Status :: 4 - Beta
-Classifier: License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: llvmlite>=0.44
-Requires-Dist: numba>=0.61
-Dynamic: license-file
-
-# da4ml: Distributed Arithmetic for Machine Learning
-
-This project performs Constant Matrix-Vector Multiplication (CMVM) with Distributed Arithmetic (DA) for Machine Learning (ML) on Field Programmable Gate Arrays (FPGAs).
-
-CMVM optimization is done through greedy CSE of two-term subexpressions, with possible Delay Constraints (DC). The optimization is done in jitted Python (Numba), and a list of optimized operations is generated as traced Python code.
-
-The project generates Verilog or Vitis HLS code for the optimized CMVM operations. This project can be used in conjunction with [`hls4ml`](https://github.com/fastmachinelearning/hls4ml/) for optimizing the neural networks deployed on FPGAs. For a subset of neural networks, the full design can be generated standalone in Verilog or Vitis HLS.
-
-
-## Installation
-
-The project is available on PyPI and can be installed with pip:
-
-```bash
-pip install da4ml
-```
-
-Notice that `numba>=6.0.0` is required for the project to work. The project does not work with `python<3.10`. If the project fails to compile, try upgrading `numba` and `llvmlite` to the latest versions.
-
-## `hls4ml`
-
-The major use of this project is through the `distributed_arithmetic` strategy in `hls4ml`:
-
-```python
-model_hls = hls4ml.converters.convert_from_keras_model(
-    model,
-    hls_config={
-        'Model': {
-            ...
-            'Strategy': 'distributed_arithmetic',
-        },
-        ...
-    },
-    ...
-)
-```
-
-Currently, `Dense/Conv1D/Conv2D` layers are supported for both `io_parallel` and `io_stream` dataflows. However, notice that distributed arithmetic implies `reuse_factor=1`, as the whole kernel is implemented in combinational logic.
-
-## Standalone usage
-
-### `HGQ2`
-
-For some models trained with `HGQ2`, da4ml can be used to generate the whole model in Verilog or Vitis HLS:
-
-```python
-from da4ml.codegen import HLSModel, VerilogModel
-from da4ml.converter.hgq2.parser import trace_model
-from da4ml.trace import comb_trace
-
-inp, out = trace_model(hgq2_model)
-comb_logic = comb_trace(inp[0], out[0])  # Currently, only models with 1 input and 1 output are supported
-
-# Pipelined Verilog model generation
-# `latency_cutoff` is used to control auto pipelining behavior. To disable pipelining, set it to -1.
-verilog_model = VerilogModel(comb_logic, prj_name='barbar', path='/tmp/barbar', latency_cutoff=5)
-verilog_model.compile()  # write and verilator binding
-verilog_model.predict(inputs)
-
-vitis_hls_model = HLSModel(comb_logic, prj_name='foo', path='/tmp/foo', flavor='vitis')  # Only vitis is supported for now
-vitis_hls_model.compile()  # write and hls binding
-vitis_hls_model.predict(inputs)
-```
-
-### Functional Definition
-For generic operations, one can define combinational logic with the functional API:
-
-```python
-from da4ml.trace import FixedVariableArray, HWConfig, comb_trace
-from da4ml.trace.ops import einsum, relu, quantize, conv, pool
-
-# k, i, f are numpy arrays of integers: keep_negative (0/1), integer bits (excl. sign), fractional bits
-inp = FixedVariableArray.from_kif(k, i, f, HWConfig(1, -1, -1), solver_options={'hard_dc':2})
-out = inp @ kernel
-out = relu(out)
-out = einsum(equation, out, weights)
-...
-
-comb = comb_trace(inp, out)
-```
-
-`+`, `-`, and `@` are supported, as well as `einsum`, `relu`, `quantize` (WRAP, with TRN or RND), `conv`, and `pool` (average only). For multiplications, only power-of-two multipliers are supported; otherwise use the `einsum` or `@` operators.
-
-`comb_trace` returns a `Solution` object containing the list of low-level operations that implement the combinational logic, which in turn can be used to generate Verilog or Vitis HLS code.