sparsepixels 0.2.3__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sparsepixels-0.2.3/sparsepixels.egg-info → sparsepixels-0.3.0}/PKG-INFO +63 -13
- sparsepixels-0.3.0/README.md +169 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/setup.cfg +2 -1
- sparsepixels-0.3.0/sparsepixels/layers.py +325 -0
- sparsepixels-0.3.0/sparsepixels/utils.py +488 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0/sparsepixels.egg-info}/PKG-INFO +63 -13
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/sparsepixels.egg-info/SOURCES.txt +1 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/sparsepixels.egg-info/requires.txt +1 -0
- sparsepixels-0.3.0/tests/test_model.py +93 -0
- sparsepixels-0.2.3/README.md +0 -120
- sparsepixels-0.2.3/sparsepixels/layers.py +0 -174
- sparsepixels-0.2.3/tests/test_model.py +0 -57
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/LICENSE +0 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/notebook/utils.py +0 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/pyproject.toml +0 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/setup.py +0 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/sparsepixels/__init__.py +0 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/sparsepixels/img/logo.png +0 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/sparsepixels.egg-info/dependency_links.txt +0 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/sparsepixels.egg-info/top_level.txt +0 -0
- {sparsepixels-0.2.3 → sparsepixels-0.3.0}/tests/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sparsepixels
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Efficient convolution for sparse data on FPGAs
|
|
5
5
|
Home-page: https://github.com/hftsoi/sparse-pixels
|
|
6
6
|
Author: Ho Fung Tsoi
|
|
@@ -14,6 +14,7 @@ License-File: LICENSE
|
|
|
14
14
|
Requires-Dist: tensorflow
|
|
15
15
|
Requires-Dist: keras>=3.0
|
|
16
16
|
Requires-Dist: HGQ2>=0.1.8
|
|
17
|
+
Requires-Dist: matplotlib
|
|
17
18
|
Dynamic: license-file
|
|
18
19
|
|
|
19
20
|
<p align="center">
|
|
@@ -34,7 +35,7 @@ Dynamic: license-file
|
|
|
34
35
|
[](https://arxiv.org/abs/2512.06208)
|
|
35
36
|
[](https://pypi.org/project/sparsepixels)
|
|
36
37
|
|
|
37
|
-
|
|
38
|
+
SparsePixels is a Keras 3 library to build, train, and deploy sparse convolutional neural networks on FPGAs. In many detectors, especially in high-energy physics experiments, the images are almost empty: only a handful of pixels carry a signal (the hits), yet a standard CNN still spends compute on every pixel. A sparse CNN convolves only over the active pixels, so its cost scales with the number of hits rather than the image size, which is what makes low-latency, real-time inference (for example in a trigger) feasible on an FPGA. This library builds quantization-aware (via [HGQ2](https://github.com/calad0i/HGQ2)) sparse CNNs in which the pixel budget and the activity threshold can be learned from data, with a hardware-aware penalty that drives the budget toward the fewest pixels the task tolerates. Trained models convert to FPGA firmware through the [hls4ml](https://github.com/fastmachinelearning/hls4ml) integration, with control over the parallelization of the sparse layers to trade latency against resource usage.
|
|
38
39
|
|
|
39
40
|
## Installation
|
|
40
41
|
|
|
@@ -46,7 +47,7 @@ pip install sparsepixels
|
|
|
46
47
|
|
|
47
48
|
## Getting Started
|
|
48
49
|
|
|
49
|
-
Import sparse layers
|
|
50
|
+
Import the sparse layers, the quantization library (HGQ2), and the training utilities:
|
|
50
51
|
|
|
51
52
|
```python
|
|
52
53
|
import keras
|
|
@@ -54,12 +55,28 @@ from keras.layers import Flatten, Activation
|
|
|
54
55
|
from hgq.layers import QConv2D, QDense
|
|
55
56
|
from hgq.config import QuantizerConfigScope, LayerConfigScope
|
|
56
57
|
from hgq.quantizer.config import QuantizerConfig
|
|
57
|
-
from sparsepixels.layers import InputReduce, QConv2DSparse, AveragePooling2DSparse
|
|
58
|
+
from sparsepixels.layers import InputReduce, QConv2DSparse, AveragePooling2DSparse, MaxPooling2DSparse
|
|
59
|
+
from sparsepixels.utils import (
|
|
60
|
+
active_pixels_vs_threshold, plot_reduced_examples,
|
|
61
|
+
set_sparse_ebops_factor, cosine_lr,
|
|
62
|
+
SparseTrainingMonitor, plot_history,
|
|
63
|
+
print_quantization, plot_quantization,
|
|
64
|
+
)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
First, study the data to pick a threshold and an initial pixel budget `n`: how many pixels stay
|
|
68
|
+
active as the threshold rises, and what a candidate `(n, threshold)` keeps on a few images.
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
active_pixels_vs_threshold(x_train)
|
|
72
|
+
plot_reduced_examples(x_train, n=20, threshold=0.1, n_examples=4)
|
|
58
73
|
```
|
|
59
74
|
|
|
60
|
-
Build an example sparse CNN within HGQ2 quantization scopes. A custom input quantizer
|
|
61
|
-
|
|
62
|
-
|
|
75
|
+
Build an example sparse CNN within HGQ2 quantization scopes. A custom input quantizer config with
|
|
76
|
+
higher initial fractional bits (`f0=8`) prevents the default (`f0=2`) from zeroing out sparse signals
|
|
77
|
+
in early training epochs. `InputReduce` keeps the first `n` active pixels (first channel above
|
|
78
|
+
`threshold`); by default `n` and `threshold` are trainable hyperparameters, and a penalty of weight `beta_n`
|
|
79
|
+
nudges the budget smaller, trading a little accuracy for lower FPGA latency and resources.
|
|
63
80
|
|
|
64
81
|
```python
|
|
65
82
|
iq_conf = QuantizerConfig(place='datalane', q_type='kif', i0=4, f0=8, overflow_mode='WRAP')
|
|
@@ -71,8 +88,15 @@ with (
|
|
|
71
88
|
):
|
|
72
89
|
x_in = keras.Input(shape=(28, 28, 1), name='x_in')
|
|
73
90
|
|
|
74
|
-
# Sparse input reduction
|
|
75
|
-
x, keep_mask = InputReduce(
|
|
91
|
+
# Sparse input reduction
|
|
92
|
+
x, keep_mask = InputReduce(
|
|
93
|
+
n=30, # initial pixel budget
|
|
94
|
+
threshold=0.1, # initial activity threshold
|
|
95
|
+
beta_n=1e-5, # weight of the pixel budget penalty
|
|
96
|
+
learn_n=True, # trainable pixel budget
|
|
97
|
+
learn_threshold=True, # trainable threshold
|
|
98
|
+
name='input_reduce',
|
|
99
|
+
)(x_in)
|
|
76
100
|
|
|
77
101
|
# Sparse convolution
|
|
78
102
|
x = QConv2DSparse(filters=3, kernel_size=3, name='conv1', padding='same', strides=1,
|
|
@@ -88,6 +112,33 @@ with (
|
|
|
88
112
|
model = keras.Model(x_in, x)
|
|
89
113
|
```
|
|
90
114
|
|
|
115
|
+
Train the model, then read out the learned sparsity to deploy. `set_sparse_ebops_factor` makes the
|
|
116
|
+
EBOPS (a proxy for the quantized hardware cost) reflect the sparse compute rather than a dense one; a
|
|
117
|
+
cosine-decayed learning rate together with `restore_best_weights` keeps the learned budget from
|
|
118
|
+
over-compressing near the end of training. `plot_history` shows the loss breakdown, the learned
|
|
119
|
+
budget/threshold and the EBOPS in one figure, and the values to deploy are `layer.n_max_pixels` and
|
|
120
|
+
`layer.threshold`.
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
set_sparse_ebops_factor(model)
|
|
124
|
+
|
|
125
|
+
steps_per_epoch = len(x_train) // 128
|
|
126
|
+
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=20, restore_best_weights=True)
|
|
127
|
+
model.compile(
|
|
128
|
+
optimizer=keras.optimizers.Adam(cosine_lr(1e-3, epochs=100, steps_per_epoch=steps_per_epoch)),
|
|
129
|
+
loss='categorical_crossentropy', metrics=['accuracy'],
|
|
130
|
+
)
|
|
131
|
+
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
|
|
132
|
+
epochs=100, batch_size=128, callbacks=[early_stop, SparseTrainingMonitor()])
|
|
133
|
+
|
|
134
|
+
plot_history(history, early_stopping=early_stop) # loss breakdown, budget, threshold, EBOPS
|
|
135
|
+
print_quantization(model) # per-layer bit-width distribution and EBOPS
|
|
136
|
+
plot_quantization(model)
|
|
137
|
+
|
|
138
|
+
ir = model.get_layer('input_reduce')
|
|
139
|
+
print(f"deploy with n_max_pixels={ir.n_max_pixels}, threshold={ir.threshold:.3f}")
|
|
140
|
+
```
|
|
141
|
+
|
|
91
142
|
## Converting a trained model to HLS with hls4ml
|
|
92
143
|
|
|
93
144
|
> **Note:** A [PR](https://github.com/fastmachinelearning/hls4ml/pull/1468) adding `sparsepixels` support to the official [hls4ml](https://github.com/fastmachinelearning/hls4ml) repo has been submitted but is not yet merged. In the meantime you can install hls4ml from the PR branch on this fork to try the converter:
|
|
@@ -109,17 +160,17 @@ hls_model = hls4ml.converters.convert_from_keras_model(
|
|
|
109
160
|
hls_config=hls_config,
|
|
110
161
|
output_dir='hls_proj/my_sparse_cnn',
|
|
111
162
|
backend='Vitis',
|
|
112
|
-
io_type='io_parallel',
|
|
163
|
+
io_type='io_parallel',
|
|
113
164
|
)
|
|
114
165
|
hls_model.write()
|
|
115
166
|
hls_model.compile()
|
|
116
167
|
y_hls = hls_model.predict(x_test)
|
|
117
168
|
```
|
|
118
169
|
|
|
119
|
-
> **Note:** The converter currently supports only fully parallelized `io_parallel` HLS. We are working on expanding to partial parallelization and `io_stream` for larger flexibility.
|
|
120
|
-
|
|
121
170
|
## Documentation
|
|
122
171
|
|
|
172
|
+
Coming soon!
|
|
173
|
+
|
|
123
174
|
## Citation
|
|
124
175
|
|
|
125
176
|
If you find this useful in your research, please consider citing:
|
|
@@ -135,4 +186,3 @@ If you find this useful in your research, please consider citing:
|
|
|
135
186
|
year = "2025"
|
|
136
187
|
}
|
|
137
188
|
```
|
|
138
|
-
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/hftsoi/sparse-pixels/main/docs/figs/logo.png" width="300" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<img src="https://raw.githubusercontent.com/hftsoi/sparse-pixels/main/docs/figs/sparsepixels.png" width="900"/>
|
|
7
|
+
</p>
|
|
8
|
+
|
|
9
|
+
<p align="center">
|
|
10
|
+
<img src="https://raw.githubusercontent.com/hftsoi/sparse-pixels/main/docs/figs/cnn_standard.gif" width="400" />
|
|
11
|
+
<img src="https://raw.githubusercontent.com/hftsoi/sparse-pixels/main/docs/figs/cnn_sparse.gif" width="400" />
|
|
12
|
+
</p>
|
|
13
|
+
|
|
14
|
+
# SparsePixels: Efficient convolution for sparse data on FPGAs
|
|
15
|
+
|
|
16
|
+
[](https://arxiv.org/abs/2512.06208)
|
|
17
|
+
[](https://pypi.org/project/sparsepixels)
|
|
18
|
+
|
|
19
|
+
SparsePixels is a Keras 3 library to build, train, and deploy sparse convolutional neural networks on FPGAs. In many detectors, especially in high-energy physics experiments, the images are almost empty: only a handful of pixels carry a signal (the hits), yet a standard CNN still spends compute on every pixel. A sparse CNN convolves only over the active pixels, so its cost scales with the number of hits rather than the image size, which is what makes low-latency, real-time inference (for example in a trigger) feasible on an FPGA. This library builds quantization-aware (via [HGQ2](https://github.com/calad0i/HGQ2)) sparse CNNs in which the pixel budget and the activity threshold can be learned from data, with a hardware-aware penalty that drives the budget toward the fewest pixels the task tolerates. Trained models convert to FPGA firmware through the [hls4ml](https://github.com/fastmachinelearning/hls4ml) integration, with control over the parallelization of the sparse layers to trade latency against resource usage.
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
With Python >= 3.10:
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
pip install sparsepixels
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Getting Started
|
|
30
|
+
|
|
31
|
+
Import the sparse layers, the quantization library (HGQ2), and the training utilities:
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
import keras
|
|
35
|
+
from keras.layers import Flatten, Activation
|
|
36
|
+
from hgq.layers import QConv2D, QDense
|
|
37
|
+
from hgq.config import QuantizerConfigScope, LayerConfigScope
|
|
38
|
+
from hgq.quantizer.config import QuantizerConfig
|
|
39
|
+
from sparsepixels.layers import InputReduce, QConv2DSparse, AveragePooling2DSparse, MaxPooling2DSparse
|
|
40
|
+
from sparsepixels.utils import (
|
|
41
|
+
active_pixels_vs_threshold, plot_reduced_examples,
|
|
42
|
+
set_sparse_ebops_factor, cosine_lr,
|
|
43
|
+
SparseTrainingMonitor, plot_history,
|
|
44
|
+
print_quantization, plot_quantization,
|
|
45
|
+
)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
First, study the data to pick a threshold and an initial pixel budget `n`: how many pixels stay
|
|
49
|
+
active as the threshold rises, and what a candidate `(n, threshold)` keeps on a few images.
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
active_pixels_vs_threshold(x_train)
|
|
53
|
+
plot_reduced_examples(x_train, n=20, threshold=0.1, n_examples=4)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Build an example sparse CNN within HGQ2 quantization scopes. A custom input quantizer config with
|
|
57
|
+
higher initial fractional bits (`f0=8`) prevents the default (`f0=2`) from zeroing out sparse signals
|
|
58
|
+
in early training epochs. `InputReduce` keeps the first `n` active pixels (first channel above
|
|
59
|
+
`threshold`); by default `n` and `threshold` are trainable hyperparameters, and a penalty of weight `beta_n`
|
|
60
|
+
nudges the budget smaller, trading a little accuracy for lower FPGA latency and resources.
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
iq_conf = QuantizerConfig(place='datalane', q_type='kif', i0=4, f0=8, overflow_mode='WRAP')
|
|
64
|
+
|
|
65
|
+
with (
|
|
66
|
+
QuantizerConfigScope(place='all', default_q_type='kbi', overflow_mode='SAT_SYM'),
|
|
67
|
+
QuantizerConfigScope(place='datalane', default_q_type='kif', overflow_mode='WRAP'),
|
|
68
|
+
LayerConfigScope(enable_ebops=True, enable_iq=True, beta0=1e-5),
|
|
69
|
+
):
|
|
70
|
+
x_in = keras.Input(shape=(28, 28, 1), name='x_in')
|
|
71
|
+
|
|
72
|
+
# Sparse input reduction
|
|
73
|
+
x, keep_mask = InputReduce(
|
|
74
|
+
n=30, # initial pixel budget
|
|
75
|
+
threshold=0.1, # initial activity threshold
|
|
76
|
+
beta_n=1e-5, # weight of the pixel budget penalty
|
|
77
|
+
learn_n=True, # trainable pixel budget
|
|
78
|
+
learn_threshold=True, # trainable threshold
|
|
79
|
+
name='input_reduce',
|
|
80
|
+
)(x_in)
|
|
81
|
+
|
|
82
|
+
# Sparse convolution
|
|
83
|
+
x = QConv2DSparse(filters=3, kernel_size=3, name='conv1', padding='same', strides=1,
|
|
84
|
+
activation='relu', iq_conf=iq_conf)([x, keep_mask])
|
|
85
|
+
|
|
86
|
+
# Sparse pooling
|
|
87
|
+
x, keep_mask = AveragePooling2DSparse(2, name='pool1')([x, keep_mask])
|
|
88
|
+
|
|
89
|
+
x = Flatten(name='flatten')(x)
|
|
90
|
+
x = QDense(10, name='dense1', activation='relu', iq_conf=iq_conf)(x)
|
|
91
|
+
x = Activation('softmax', name='softmax')(x)
|
|
92
|
+
|
|
93
|
+
model = keras.Model(x_in, x)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Train the model, then read out the learned sparsity to deploy. `set_sparse_ebops_factor` makes the
|
|
97
|
+
EBOPS (a proxy for the quantized hardware cost) reflect the sparse compute rather than a dense one; a
|
|
98
|
+
cosine-decayed learning rate together with `restore_best_weights` keeps the learned budget from
|
|
99
|
+
over-compressing near the end of training. `plot_history` shows the loss breakdown, the learned
|
|
100
|
+
budget/threshold and the EBOPS in one figure, and the values to deploy are `layer.n_max_pixels` and
|
|
101
|
+
`layer.threshold`.
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
set_sparse_ebops_factor(model)
|
|
105
|
+
|
|
106
|
+
steps_per_epoch = len(x_train) // 128
|
|
107
|
+
early_stop = keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=20, restore_best_weights=True)
|
|
108
|
+
model.compile(
|
|
109
|
+
optimizer=keras.optimizers.Adam(cosine_lr(1e-3, epochs=100, steps_per_epoch=steps_per_epoch)),
|
|
110
|
+
loss='categorical_crossentropy', metrics=['accuracy'],
|
|
111
|
+
)
|
|
112
|
+
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
|
|
113
|
+
epochs=100, batch_size=128, callbacks=[early_stop, SparseTrainingMonitor()])
|
|
114
|
+
|
|
115
|
+
plot_history(history, early_stopping=early_stop) # loss breakdown, budget, threshold, EBOPS
|
|
116
|
+
print_quantization(model) # per-layer bit-width distribution and EBOPS
|
|
117
|
+
plot_quantization(model)
|
|
118
|
+
|
|
119
|
+
ir = model.get_layer('input_reduce')
|
|
120
|
+
print(f"deploy with n_max_pixels={ir.n_max_pixels}, threshold={ir.threshold:.3f}")
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Converting a trained model to HLS with hls4ml
|
|
124
|
+
|
|
125
|
+
> **Note:** A [PR](https://github.com/fastmachinelearning/hls4ml/pull/1468) adding `sparsepixels` support to the official [hls4ml](https://github.com/fastmachinelearning/hls4ml) repo has been submitted but is not yet merged. In the meantime you can install hls4ml from the PR branch on this fork to try the converter:
|
|
126
|
+
>
|
|
127
|
+
> ```bash
|
|
128
|
+
> pip install "git+https://github.com/hftsoi/hls4ml.git@sparsepixels"
|
|
129
|
+
> ```
|
|
130
|
+
|
|
131
|
+
Once installed, converting a trained sparsepixels model to HLS is as usual:
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
import hls4ml
|
|
135
|
+
|
|
136
|
+
hls_config = hls4ml.utils.config_from_keras_model(model, granularity='name')
|
|
137
|
+
hls_config.setdefault('Model', {})['PipelineStyle'] = 'dataflow' # use "#pragma HLS DATAFLOW" (instead of the default "#pragma HLS PIPELINE" for io_parallel)
|
|
138
|
+
|
|
139
|
+
hls_model = hls4ml.converters.convert_from_keras_model(
|
|
140
|
+
model,
|
|
141
|
+
hls_config=hls_config,
|
|
142
|
+
output_dir='hls_proj/my_sparse_cnn',
|
|
143
|
+
backend='Vitis',
|
|
144
|
+
io_type='io_parallel',
|
|
145
|
+
)
|
|
146
|
+
hls_model.write()
|
|
147
|
+
hls_model.compile()
|
|
148
|
+
y_hls = hls_model.predict(x_test)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Documentation
|
|
152
|
+
|
|
153
|
+
Coming soon!
|
|
154
|
+
|
|
155
|
+
## Citation
|
|
156
|
+
|
|
157
|
+
If you find this useful in your research, please consider citing:
|
|
158
|
+
|
|
159
|
+
```
|
|
160
|
+
@article{Tsoi:2025nvg,
|
|
161
|
+
author = "Tsoi, Ho Fung and Rankin, Dylan and Loncar, Vladimir and Harris, Philip",
|
|
162
|
+
title = "{SparsePixels: Efficient Convolution for Sparse Data on FPGAs}",
|
|
163
|
+
eprint = "2512.06208",
|
|
164
|
+
archivePrefix = "arXiv",
|
|
165
|
+
primaryClass = "cs.AR",
|
|
166
|
+
month = "12",
|
|
167
|
+
year = "2025"
|
|
168
|
+
}
|
|
169
|
+
```
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = sparsepixels
|
|
3
|
-
version = 0.
|
|
3
|
+
version = 0.3.0
|
|
4
4
|
description = Efficient convolution for sparse data on FPGAs
|
|
5
5
|
author = Ho Fung Tsoi
|
|
6
6
|
author_email = ho.fung.tsoi@cern.ch
|
|
@@ -20,6 +20,7 @@ install_requires =
|
|
|
20
20
|
tensorflow
|
|
21
21
|
keras>=3.0
|
|
22
22
|
HGQ2>=0.1.8
|
|
23
|
+
matplotlib
|
|
23
24
|
include_package_data = True
|
|
24
25
|
|
|
25
26
|
[options.package_data]
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
import keras
|
|
2
|
+
from hgq.layers import QConv2D
|
|
3
|
+
from hgq.quantizer import Quantizer
|
|
4
|
+
from hgq.quantizer.config import QuantizerConfig
|
|
5
|
+
from keras import ops
|
|
6
|
+
from keras.layers import AveragePooling2D, MaxPooling2D
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class _ClipToRange(keras.constraints.Constraint):
    """Weight constraint that clamps every value into the closed range [lo, hi].

    Intended to be attached to a weight so the clamp is re-applied after each update.

    Args:
        lo: lower bound of the allowed range (stored as a float).
        hi: upper bound of the allowed range (stored as a float).
    """

    def __init__(self, lo, hi):
        self.lo, self.hi = float(lo), float(hi)

    def __call__(self, w):
        # Elementwise clamp; values already inside the range pass through unchanged.
        lower, upper = self.lo, self.hi
        return ops.clip(w, lower, upper)

    def get_config(self):
        # The two bounds are the whole state needed to rebuild the constraint.
        return dict(lo=self.lo, hi=self.hi)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class InputReduce(keras.layers.Layer):
    """Reduce a dense image to its first n active pixels for sparse FPGA inference.

    Keeps the first n pixels whose first channel is above threshold, in raster order, and zeroes the
    rest, returning the masked image together with a 0/1 keep mask that the following sparse layers
    use as the sparse representation.

    The budget n and the threshold can be learned during training (the default) so they need not be
    tuned by hand; set learn_n or learn_threshold to False to keep either fixed (both False gives the
    plain, non-learnable selection). The selection is always exact -- the learnable versions only
    shape the gradient, so the layer behaves identically at inference and stays deployable. When n is
    learned, a penalty of weight beta_n nudges it smaller, trading a little accuracy for lower FPGA
    latency and resources; it starts at n and stays within [1, 4*n]. After training, read the values
    to deploy from the n_max_pixels and threshold properties.

    Args:
        n: initial pixel budget, and the fixed budget when learn_n is False. Must be >= 1 when
            learn_n is True, because the learned budget is parametrized as a fraction of n.
        threshold: initial activity threshold on the first channel, fixed when learn_threshold is False.
        beta_n: weight of the budget penalty added to the loss (0 disables it).
        learn_n: make the pixel budget trainable.
        learn_threshold: make the threshold trainable.
        tau_threshold: softness of the threshold surrogate used to obtain gradients.
        tau_n: softness of the budget-cutoff surrogate used to obtain gradients.

    Call args:
        inputs: image tensor indexed as (batch, height, width, channels); only channel 0 is used
            to decide which pixels are active.

    Returns:
        Tuple (inputs_reduced, keep_mask): the input multiplied by the keep mask, and the keep
        mask itself reshaped to (batch, height, width, 1).

    Raises:
        ValueError: if learn_n is True and n < 1.
    """

    def __init__(
        self,
        n=30,
        threshold=0.0,
        beta_n=1e-5,
        learn_n=True,
        learn_threshold=True,
        tau_threshold=0.05,
        tau_n=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.n_init = int(n)
        self.threshold_init = float(threshold)
        self.beta_n = float(beta_n)
        self.learn_n = learn_n
        self.learn_threshold = learn_threshold
        self.tau_threshold = float(tau_threshold)
        self.tau_n = float(tau_n)

        # The learnable budget is n_init * n_frac with n_frac clipped to [1 / n_init, 4]; a
        # non-positive n_init would divide by zero (or invert the clip range) in build(), so
        # reject it here with an explicit message instead.
        if self.learn_n and self.n_init < 1:
            raise ValueError(
                f"InputReduce requires n >= 1 when learn_n=True, got n={n!r}."
            )

    def build(self, input_shape):
        if self.learn_threshold:
            # Scalar trainable threshold, constrained to stay non-negative.
            self.threshold_w = self.add_weight(
                name="threshold",
                shape=(),
                initializer=keras.initializers.Constant(self.threshold_init),
                trainable=True,
                constraint=keras.constraints.NonNeg(),
            )
        if self.learn_n:
            # Parametrize the budget as a fraction of its initial value (n = n_init * n_frac) so it
            # moves at a useful rate under Adam; clip to [1, 4x] so it can shrink or modestly grow.
            self.n_frac = self.add_weight(
                name="n_frac",
                shape=(),
                initializer=keras.initializers.Constant(1.0),
                trainable=True,
                constraint=_ClipToRange(1.0 / self.n_init, 4.0),
            )
        super().build(input_shape)

    def call(self, inputs):
        dt = inputs.dtype
        batch_size = ops.shape(inputs)[0]
        h = ops.shape(inputs)[1]
        w = ops.shape(inputs)[2]
        # Flatten the first channel to (batch, h * w) so a cumulative sum along axis 1 ranks the
        # pixels in raster order.
        score = ops.reshape(inputs[..., 0], [batch_size, h * w])

        thr = self.threshold_w if self.learn_threshold else ops.cast(self.threshold_init, dt)
        n = ops.cast(self.n_init, dt) * self.n_frac if self.learn_n else ops.cast(self.n_init, dt)

        # Exact selection used for the forward pass.
        active_hard = ops.cast(score > thr, dt)
        # rank_hard[i] counts the active pixels up to and including pixel i.
        rank_hard = ops.cumsum(active_hard, axis=1)
        keep_hard = active_hard * ops.cast(rank_hard <= ops.round(n), dt)

        if self.learn_threshold or self.learn_n:
            # Differentiable surrogate used only for the gradient (straight-through to the exact
            # selection above); the score > 0 gate keeps zero background pixels out when threshold=0.
            active_soft = ops.sigmoid((score - thr) / self.tau_threshold) * ops.cast(score > 0, dt)
            rank_soft = ops.cumsum(active_soft, axis=1)
            keep_soft = active_soft * ops.sigmoid((n - rank_soft) / self.tau_n)
            # Forward value equals keep_hard; the gradient flows through keep_soft only.
            keep_flat = keep_soft + ops.stop_gradient(keep_hard - keep_soft)
        else:
            keep_flat = keep_hard

        keep_mask = ops.reshape(keep_flat, [batch_size, h, w, 1])
        inputs_reduced = inputs * keep_mask

        if self.learn_n:
            # Budget penalty: proportional to the (continuous) learned budget.
            self.add_loss(self.beta_n * n)

        return inputs_reduced, keep_mask

    @property
    def n_max_pixels(self):
        """Integer pixel budget to deploy (the initial value until the layer is built)."""
        if self.learn_n and self.built:
            return int(round(self.n_init * float(ops.convert_to_numpy(self.n_frac))))
        return self.n_init

    @property
    def threshold(self):
        """Threshold to deploy (the initial value until the layer is built)."""
        if self.learn_threshold and self.built:
            return float(ops.convert_to_numpy(self.threshold_w))
        return self.threshold_init

    def get_config(self):
        """Return the constructor arguments (initial values, not the learned ones)."""
        config = super().get_config()
        config.update(
            {
                "n": self.n_init,
                "threshold": self.threshold_init,
                "beta_n": self.beta_n,
                "learn_n": self.learn_n,
                "learn_threshold": self.learn_threshold,
                "tau_threshold": self.tau_threshold,
                "tau_n": self.tau_n,
            }
        )
        return config
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class RemoveDilatedPixels(keras.layers.Layer):
    """Zero out every pixel of a feature map that its keep mask marks as inactive.

    The feature map is multiplied by the 0/1 keep mask (broadcast over channels), so only kept
    pixels retain their values. The sparse layers use this to get back to the sparse
    representation after running a dense op.

    Call args:
        inputs: tuple (x, mask) of the feature map and its keep mask.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        feature_map, keep = inputs
        # Bring the mask to the feature map's dtype before multiplying.
        return feature_map * ops.cast(keep, feature_map.dtype)

    def get_config(self):
        # No extra constructor arguments beyond the base layer's.
        base_config = super().get_config()
        return base_config
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
class QConv2DSparse(keras.layers.Layer):
    """Quantized 2D convolution that operates on the sparse (active-pixel) representation.

    Wraps an HGQ QConv2D: masks the input to the active pixels, convolves, adds a separately
    quantized per-filter bias on the nonzero outputs, applies the activation, then re-masks the
    output. This is numerically the same as a dense quantized conv restricted to the active pixels,
    which is what the HLS sparse_conv kernel computes.

    Args:
        *conv_args: positional arguments forwarded to hgq.layers.QConv2D (e.g. filters, kernel_size).
        **conv_kwargs: keyword arguments forwarded to QConv2D (padding, strides, ...). use_bias,
            activation and bq_conf are handled here: the bias has its own weight and quantizer
            (bq_conf), and the activation is applied after the bias. The wrapped conv is always
            created with use_bias=False and activation=None.

    Call args:
        inputs: tuple (x, keep_mask) of the feature map and its keep mask.
    """

    def __init__(self, *conv_args, **conv_kwargs):
        super().__init__(name=conv_kwargs.get("name", None))
        # Pull out the options this wrapper implements itself before forwarding the rest.
        self._use_bias = conv_kwargs.pop("use_bias", True)
        self._bq_conf = conv_kwargs.pop("bq_conf", None) or QuantizerConfig("default", "bias")
        self._activation = keras.activations.get(conv_kwargs.pop("activation", None))

        # The wrapped conv must be a pure convolution: bias and activation are applied in call().
        conv_kwargs["use_bias"] = False
        conv_kwargs["activation"] = None
        self.conv = QConv2D(*conv_args, **conv_kwargs)
        self.masker = RemoveDilatedPixels()

    def build(self, input_shape):
        # Build the wrapped conv eagerly here rather than lazily in call(): building it while Keras
        # symbolically traces call() triggers an HGQ weight check that fails in graph mode.
        x_shape = input_shape[0]
        if not self.conv.built:
            self.conv.build(x_shape)
        if self._use_bias:
            # One bias value per output filter, with its own quantizer.
            self.sparse_bias = self.add_weight(
                name="sparse_bias",
                shape=(self.conv.filters,),
                initializer="zeros",
                trainable=True,
            )
            self._bq = Quantizer(self._bq_conf, name=f"{self.name}_bq")
            self._bq.build((self.conv.filters,))
        super().build(input_shape)

    def compute_output_shape(self, input_shape):
        # Return the shape directly so Keras does not trace call() (masking preserves the shape).
        return self.conv.compute_output_shape(input_shape[0])

    def call(self, inputs, **kwargs):
        x, keep_mask = inputs
        x = self.masker((x, keep_mask))
        y = self.conv(x, **kwargs)

        if self._use_bias:
            b = self._bq(self.sparse_bias)
            b = ops.reshape(b, (1, 1, 1, -1))
            # Add the bias only where the convolution produced a nonzero value.
            non_zero = ops.cast(y != 0, y.dtype)
            y = y + b * non_zero

        if self._activation is not None:
            y = self._activation(y)

        y = self.masker((y, keep_mask))
        return y

    def get_config(self):
        """Return the wrapper config: the wrapped conv's config plus the wrapper-level options."""
        cfg = super().get_config()
        cfg["conv_config"] = self.conv.get_config()
        cfg["use_bias"] = self._use_bias
        cfg["bq_conf"] = self._bq_conf
        cfg["activation"] = keras.activations.serialize(self._activation)
        return cfg

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer from the dict produced by get_config().

        The wrapped conv's config may itself contain use_bias / activation / bq_conf entries
        (the wrapped conv is created with use_bias=False and activation=None). Passing those
        alongside the wrapper's own values as separate keyword arguments would raise
        "got multiple values for keyword argument", so the wrapper's values are merged into
        the conv config and override any entries already there.
        """
        config = dict(config)  # work on a copy so the caller's dict is not mutated
        conv_cfg = dict(config.pop("conv_config"))
        conv_cfg["use_bias"] = config.pop("use_bias", True)
        conv_cfg["bq_conf"] = config.pop("bq_conf", None)
        conv_cfg["activation"] = config.pop("activation", None)
        return cls(**conv_cfg)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
class AveragePooling2DSparse(keras.layers.Layer):
    """Average pooling for the sparse (feature map, keep mask) representation.

    The feature map is average-pooled while the keep mask is max-pooled, so a pooled cell stays
    active when any of its source pixels were active. Mirrors the HLS sparse_pooling_avg kernel.

    Args:
        *pool_args: positional arguments forwarded to keras AveragePooling2D (e.g. pool_size).
        **pool_kwargs: keyword arguments forwarded to the pooling layers.

    Call args:
        inputs: tuple (x, keep_mask) of the feature map and its keep mask.
    """

    def __init__(self, *pool_args, **pool_kwargs):
        layer_name = pool_kwargs.get("name", None)
        super().__init__(name=layer_name)
        # Both pooling layers receive exactly the same arguments, so their output shapes agree.
        self.avg_pool = AveragePooling2D(*pool_args, **pool_kwargs)
        self.max_pool = MaxPooling2D(*pool_args, **pool_kwargs)

    def call(self, inputs, **kwargs):
        features, mask = inputs
        # Extra call kwargs go to the feature pooling only; the mask is pooled without them.
        return self.avg_pool(features, **kwargs), self.max_pool(mask)

    def get_config(self):
        # The average-pooling layer's config holds the arguments needed to rebuild this layer.
        return {**super().get_config(), "pool_config": self.avg_pool.get_config()}

    @classmethod
    def from_config(cls, config):
        return cls(**config.pop("pool_config"))
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
class MaxPooling2DSparse(keras.layers.Layer):
    """Max pooling for the sparse (feature map, keep mask) representation.

    The same max-pooling layer is applied to the feature map and to the keep mask. Mirrors the
    HLS sparse_pooling_max kernel.

    Args:
        *pool_args: positional arguments forwarded to keras MaxPooling2D (e.g. pool_size).
        **pool_kwargs: keyword arguments forwarded to the pooling layer.

    Call args:
        inputs: tuple (x, keep_mask) of the feature map and its keep mask.
    """

    def __init__(self, *pool_args, **pool_kwargs):
        layer_name = pool_kwargs.get("name", None)
        super().__init__(name=layer_name)
        self.max_pool = MaxPooling2D(*pool_args, **pool_kwargs)

    def call(self, inputs, **kwargs):
        features, mask = inputs
        # Extra call kwargs go to the feature pooling only; the mask is pooled without them.
        return self.max_pool(features, **kwargs), self.max_pool(mask)

    def get_config(self):
        # The pooling layer's config holds the arguments needed to rebuild this layer.
        return {**super().get_config(), "pool_config": self.max_pool.get_config()}

    @classmethod
    def from_config(cls, config):
        return cls(**config.pop("pool_config"))
|