neurovisionx 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neurovisionx-0.1.0/LICENSE +21 -0
- neurovisionx-0.1.0/PKG-INFO +203 -0
- neurovisionx-0.1.0/README.md +168 -0
- neurovisionx-0.1.0/neurovisionx/__init__.py +42 -0
- neurovisionx-0.1.0/neurovisionx/backbone.py +42 -0
- neurovisionx-0.1.0/neurovisionx/dataset.py +87 -0
- neurovisionx-0.1.0/neurovisionx/detection_head.py +34 -0
- neurovisionx-0.1.0/neurovisionx/layers.py +87 -0
- neurovisionx-0.1.0/neurovisionx/model.py +58 -0
- neurovisionx-0.1.0/neurovisionx/trainer.py +100 -0
- neurovisionx-0.1.0/neurovisionx/utils.py +86 -0
- neurovisionx-0.1.0/neurovisionx.egg-info/PKG-INFO +203 -0
- neurovisionx-0.1.0/neurovisionx.egg-info/SOURCES.txt +16 -0
- neurovisionx-0.1.0/neurovisionx.egg-info/dependency_links.txt +1 -0
- neurovisionx-0.1.0/neurovisionx.egg-info/requires.txt +10 -0
- neurovisionx-0.1.0/neurovisionx.egg-info/top_level.txt +1 -0
- neurovisionx-0.1.0/pyproject.toml +44 -0
- neurovisionx-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Srikanth Sridhar
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: neurovisionx
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Unified CNN library for image classification and anchor-free object detection with Adaptive Feature Pyramid Attention (AFPA).
|
|
5
|
+
Author-email: Srikanth Sridhar <srisrikanthtvs@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/sricodings/neurovisionx
|
|
8
|
+
Project-URL: Documentation, https://github.com/sricodings/neurovisionx#readme
|
|
9
|
+
Project-URL: Issues, https://github.com/sricodings/neurovisionx/issues
|
|
10
|
+
Keywords: deep learning,computer vision,object detection,image classification,pytorch,cnn,attention
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: torch>=1.13
|
|
26
|
+
Requires-Dist: torchvision>=0.14
|
|
27
|
+
Requires-Dist: pillow>=9.0
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest; extra == "dev"
|
|
30
|
+
Requires-Dist: black; extra == "dev"
|
|
31
|
+
Requires-Dist: flake8; extra == "dev"
|
|
32
|
+
Requires-Dist: build; extra == "dev"
|
|
33
|
+
Requires-Dist: twine; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# NeuroVisionX
|
|
37
|
+
|
|
38
|
+
**A unified PyTorch library for image classification and anchor-free object detection**,
|
|
39
|
+
built around a single shared backbone and a custom attention module: **AFPA (Adaptive
|
|
40
|
+
Feature Pyramid Attention)**.
|
|
41
|
+
|
|
42
|
+
[PyPI](https://pypi.org/project/neurovisionx/)
|
|
43
|
+
[License: MIT](LICENSE)
|
|
44
|
+
[Python 3.8+](https://www.python.org/)
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Why NeuroVisionX
|
|
49
|
+
|
|
50
|
+
Most beginner-friendly libraries make you choose: a simple classifier *or* a heavy,
|
|
51
|
+
hard-to-configure detector (YOLO/SSD-style with anchor boxes and many hyperparameters).
|
|
52
|
+
|
|
53
|
+
NeuroVisionX gives you **both, from one backbone**:
|
|
54
|
+
|
|
55
|
+
- **One backbone, two heads.** Train a classifier, a detector, or both, without
|
|
56
|
+
duplicating feature extraction code.
|
|
57
|
+
- **AFPA attention block.** Lightweight channel+spatial attention with a learnable
|
|
58
|
+
residual gate, so it starts as identity and "switches on" only when useful —
|
|
59
|
+
more stable than dropping raw attention into a network from scratch.
|
|
60
|
+
- **Anchor-free detection head.** Predicts object centers, sizes, and sub-pixel
|
|
61
|
+
offsets directly (CenterNet/FCOS-style) — no anchor box tuning.
|
|
62
|
+
- **One `Trainer` class** for both tasks, with sensible defaults.
|
|
63
|
+
- **Minimal dependencies**: `torch`, `torchvision`, `pillow`. Nothing exotic.
|
|
64
|
+
|
|
65
|
+
## Installation
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install neurovisionx
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
For local development:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
git clone https://github.com/sricodings/neurovisionx.git
|
|
75
|
+
cd neurovisionx
|
|
76
|
+
pip install -e ".[dev]"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Quick start
|
|
80
|
+
|
|
81
|
+
### Classification
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
import neurovisionx as nv
|
|
85
|
+
from torch.utils.data import DataLoader
|
|
86
|
+
|
|
87
|
+
train_ds = nv.ImageFolderDataset("data/train", image_size=224)
|
|
88
|
+
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
|
|
89
|
+
|
|
90
|
+
model = nv.NeuroVisionXNet(num_classes=len(train_ds.classes), mode="classify")
|
|
91
|
+
trainer = nv.Trainer(model, train_loader, task="classify")
|
|
92
|
+
trainer.fit(epochs=20)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Object detection
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
import neurovisionx as nv
|
|
99
|
+
from torch.utils.data import DataLoader
|
|
100
|
+
|
|
101
|
+
train_ds = nv.DetectionDataset("data/images", "data/annotations.json", image_size=512)
|
|
102
|
+
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True,
|
|
103
|
+
collate_fn=nv.dataset.detection_collate_fn)
|
|
104
|
+
|
|
105
|
+
model = nv.NeuroVisionXNet(num_classes=5, mode="detect")
|
|
106
|
+
trainer = nv.Trainer(model, train_loader, task="detect")
|
|
107
|
+
trainer.fit(epochs=30)
|
|
108
|
+
|
|
109
|
+
detections = model.detect(image_tensor, score_thresh=0.4)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Annotation format (`annotations.json`) — simple COCO-lite JSON:
|
|
113
|
+
|
|
114
|
+
```json
|
|
115
|
+
[
|
|
116
|
+
{"image": "img1.jpg", "boxes": [[34, 12, 200, 180]], "labels": [0]},
|
|
117
|
+
{"image": "img2.jpg", "boxes": [[10, 10, 50, 50], [60, 60, 120, 140]], "labels": [2, 4]}
|
|
118
|
+
]
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Architecture overview
|
|
122
|
+
|
|
123
|
+
```
|
|
124
|
+
Input image
|
|
125
|
+
│
|
|
126
|
+
▼
|
|
127
|
+
Stem conv
|
|
128
|
+
│
|
|
129
|
+
▼
|
|
130
|
+
Stage 1 (downsample + AFPA) ─────────────┐
|
|
131
|
+
│ │
|
|
132
|
+
▼ │
|
|
133
|
+
Stage 2 (downsample + AFPA) -> C3 │ multi-scale
|
|
134
|
+
│ │ feature maps
|
|
135
|
+
▼ │
|
|
136
|
+
Stage 3 (downsample + AFPA) -> C4 ──┐ │
|
|
137
|
+
│ │ │
|
|
138
|
+
▼ │ │
|
|
139
|
+
Stage 4 (downsample + AFPA) -> C5 │ │
|
|
140
|
+
│ │ │
|
|
141
|
+
├──> Classifier head (GAP + FC) │ │
|
|
142
|
+
│ ▼ │
|
|
143
|
+
└──> 1x1 conv + upsample ──> fuse with C4
|
|
144
|
+
│
|
|
145
|
+
▼
|
|
146
|
+
Anchor-free detection head
|
|
147
|
+
(heatmap, size, offset maps)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Repository layout
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
neurovisionx/
|
|
154
|
+
├── neurovisionx/
|
|
155
|
+
│ ├── __init__.py # public API
|
|
156
|
+
│ ├── layers.py # ConvBNAct, AFPABlock (the novel attention module)
|
|
157
|
+
│ ├── backbone.py # multi-stage CNN backbone
|
|
158
|
+
│ ├── detection_head.py # anchor-free detection head
|
|
159
|
+
│ ├── model.py # NeuroVisionXNet (combines backbone + heads)
|
|
160
|
+
│ ├── dataset.py # ImageFolderDataset, DetectionDataset
|
|
161
|
+
│ ├── trainer.py # Trainer (training loop, losses)
|
|
162
|
+
│ └── utils.py # IoU, NMS, heatmap decoding, visualization
|
|
163
|
+
├── examples/
|
|
164
|
+
│ ├── train_classifier.py
|
|
165
|
+
│ └── train_detector.py
|
|
166
|
+
├── tests/
|
|
167
|
+
├── docs/
|
|
168
|
+
│ └── PUBLISHING_GUIDE.md
|
|
169
|
+
├── pyproject.toml
|
|
170
|
+
├── LICENSE
|
|
171
|
+
└── README.md
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Roadmap
|
|
175
|
+
|
|
176
|
+
- [ ] Pretrained backbone weights (ImageNet-style pretraining)
|
|
177
|
+
- [ ] ONNX export for deployment
|
|
178
|
+
- [ ] Mixed-precision training support
|
|
179
|
+
- [ ] Data augmentation pipeline (mosaic, mixup, random crop)
|
|
180
|
+
- [ ] Benchmark results vs ResNet/YOLO baselines on COCO subset
|
|
181
|
+
- [ ] Sphinx-generated documentation site
|
|
182
|
+
|
|
183
|
+
## Contributing
|
|
184
|
+
|
|
185
|
+
Pull requests are welcome. Please open an issue first to discuss major changes.
|
|
186
|
+
Run `pytest` and `black .` before submitting.
|
|
187
|
+
|
|
188
|
+
## Citing
|
|
189
|
+
|
|
190
|
+
If you use NeuroVisionX in research, please cite:
|
|
191
|
+
|
|
192
|
+
```bibtex
|
|
193
|
+
@software{neurovisionx2026,
|
|
194
|
+
author = {Srikanth Sridhar},
|
|
195
|
+
title = {NeuroVisionX: Unified Classification and Anchor-Free Detection with Adaptive Feature Pyramid Attention},
|
|
196
|
+
year = {2026},
|
|
197
|
+
url = {https://github.com/sricodings/neurovisionx}
|
|
198
|
+
}
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## License
|
|
202
|
+
|
|
203
|
+
MIT License — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# NeuroVisionX
|
|
2
|
+
|
|
3
|
+
**A unified PyTorch library for image classification and anchor-free object detection**,
|
|
4
|
+
built around a single shared backbone and a custom attention module: **AFPA (Adaptive
|
|
5
|
+
Feature Pyramid Attention)**.
|
|
6
|
+
|
|
7
|
+
[PyPI](https://pypi.org/project/neurovisionx/)
|
|
8
|
+
[License: MIT](LICENSE)
|
|
9
|
+
[Python 3.8+](https://www.python.org/)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Why NeuroVisionX
|
|
14
|
+
|
|
15
|
+
Most beginner-friendly libraries make you choose: a simple classifier *or* a heavy,
|
|
16
|
+
hard-to-configure detector (YOLO/SSD-style with anchor boxes and many hyperparameters).
|
|
17
|
+
|
|
18
|
+
NeuroVisionX gives you **both, from one backbone**:
|
|
19
|
+
|
|
20
|
+
- **One backbone, two heads.** Train a classifier, a detector, or both, without
|
|
21
|
+
duplicating feature extraction code.
|
|
22
|
+
- **AFPA attention block.** Lightweight channel+spatial attention with a learnable
|
|
23
|
+
residual gate, so it starts as identity and "switches on" only when useful —
|
|
24
|
+
more stable than dropping raw attention into a network from scratch.
|
|
25
|
+
- **Anchor-free detection head.** Predicts object centers, sizes, and sub-pixel
|
|
26
|
+
offsets directly (CenterNet/FCOS-style) — no anchor box tuning.
|
|
27
|
+
- **One `Trainer` class** for both tasks, with sensible defaults.
|
|
28
|
+
- **Minimal dependencies**: `torch`, `torchvision`, `pillow`. Nothing exotic.
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install neurovisionx
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
For local development:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
git clone https://github.com/sricodings/neurovisionx.git
|
|
40
|
+
cd neurovisionx
|
|
41
|
+
pip install -e ".[dev]"
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Quick start
|
|
45
|
+
|
|
46
|
+
### Classification
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
import neurovisionx as nv
|
|
50
|
+
from torch.utils.data import DataLoader
|
|
51
|
+
|
|
52
|
+
train_ds = nv.ImageFolderDataset("data/train", image_size=224)
|
|
53
|
+
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
|
|
54
|
+
|
|
55
|
+
model = nv.NeuroVisionXNet(num_classes=len(train_ds.classes), mode="classify")
|
|
56
|
+
trainer = nv.Trainer(model, train_loader, task="classify")
|
|
57
|
+
trainer.fit(epochs=20)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Object detection
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import neurovisionx as nv
|
|
64
|
+
from torch.utils.data import DataLoader
|
|
65
|
+
|
|
66
|
+
train_ds = nv.DetectionDataset("data/images", "data/annotations.json", image_size=512)
|
|
67
|
+
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True,
|
|
68
|
+
collate_fn=nv.dataset.detection_collate_fn)
|
|
69
|
+
|
|
70
|
+
model = nv.NeuroVisionXNet(num_classes=5, mode="detect")
|
|
71
|
+
trainer = nv.Trainer(model, train_loader, task="detect")
|
|
72
|
+
trainer.fit(epochs=30)
|
|
73
|
+
|
|
74
|
+
detections = model.detect(image_tensor, score_thresh=0.4)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Annotation format (`annotations.json`) — simple COCO-lite JSON:
|
|
78
|
+
|
|
79
|
+
```json
|
|
80
|
+
[
|
|
81
|
+
{"image": "img1.jpg", "boxes": [[34, 12, 200, 180]], "labels": [0]},
|
|
82
|
+
{"image": "img2.jpg", "boxes": [[10, 10, 50, 50], [60, 60, 120, 140]], "labels": [2, 4]}
|
|
83
|
+
]
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## Architecture overview
|
|
87
|
+
|
|
88
|
+
```
|
|
89
|
+
Input image
|
|
90
|
+
│
|
|
91
|
+
▼
|
|
92
|
+
Stem conv
|
|
93
|
+
│
|
|
94
|
+
▼
|
|
95
|
+
Stage 1 (downsample + AFPA) ─────────────┐
|
|
96
|
+
│ │
|
|
97
|
+
▼ │
|
|
98
|
+
Stage 2 (downsample + AFPA) -> C3 │ multi-scale
|
|
99
|
+
│ │ feature maps
|
|
100
|
+
▼ │
|
|
101
|
+
Stage 3 (downsample + AFPA) -> C4 ──┐ │
|
|
102
|
+
│ │ │
|
|
103
|
+
▼ │ │
|
|
104
|
+
Stage 4 (downsample + AFPA) -> C5 │ │
|
|
105
|
+
│ │ │
|
|
106
|
+
├──> Classifier head (GAP + FC) │ │
|
|
107
|
+
│ ▼ │
|
|
108
|
+
└──> 1x1 conv + upsample ──> fuse with C4
|
|
109
|
+
│
|
|
110
|
+
▼
|
|
111
|
+
Anchor-free detection head
|
|
112
|
+
(heatmap, size, offset maps)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Repository layout
|
|
116
|
+
|
|
117
|
+
```
|
|
118
|
+
neurovisionx/
|
|
119
|
+
├── neurovisionx/
|
|
120
|
+
│ ├── __init__.py # public API
|
|
121
|
+
│ ├── layers.py # ConvBNAct, AFPABlock (the novel attention module)
|
|
122
|
+
│ ├── backbone.py # multi-stage CNN backbone
|
|
123
|
+
│ ├── detection_head.py # anchor-free detection head
|
|
124
|
+
│ ├── model.py # NeuroVisionXNet (combines backbone + heads)
|
|
125
|
+
│ ├── dataset.py # ImageFolderDataset, DetectionDataset
|
|
126
|
+
│ ├── trainer.py # Trainer (training loop, losses)
|
|
127
|
+
│ └── utils.py # IoU, NMS, heatmap decoding, visualization
|
|
128
|
+
├── examples/
|
|
129
|
+
│ ├── train_classifier.py
|
|
130
|
+
│ └── train_detector.py
|
|
131
|
+
├── tests/
|
|
132
|
+
├── docs/
|
|
133
|
+
│ └── PUBLISHING_GUIDE.md
|
|
134
|
+
├── pyproject.toml
|
|
135
|
+
├── LICENSE
|
|
136
|
+
└── README.md
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Roadmap
|
|
140
|
+
|
|
141
|
+
- [ ] Pretrained backbone weights (ImageNet-style pretraining)
|
|
142
|
+
- [ ] ONNX export for deployment
|
|
143
|
+
- [ ] Mixed-precision training support
|
|
144
|
+
- [ ] Data augmentation pipeline (mosaic, mixup, random crop)
|
|
145
|
+
- [ ] Benchmark results vs ResNet/YOLO baselines on COCO subset
|
|
146
|
+
- [ ] Sphinx-generated documentation site
|
|
147
|
+
|
|
148
|
+
## Contributing
|
|
149
|
+
|
|
150
|
+
Pull requests are welcome. Please open an issue first to discuss major changes.
|
|
151
|
+
Run `pytest` and `black .` before submitting.
|
|
152
|
+
|
|
153
|
+
## Citing
|
|
154
|
+
|
|
155
|
+
If you use NeuroVisionX in research, please cite:
|
|
156
|
+
|
|
157
|
+
```bibtex
|
|
158
|
+
@software{neurovisionx2026,
|
|
159
|
+
author = {Srikanth Sridhar},
|
|
160
|
+
title = {NeuroVisionX: Unified Classification and Anchor-Free Detection with Adaptive Feature Pyramid Attention},
|
|
161
|
+
year = {2026},
|
|
162
|
+
url = {https://github.com/sricodings/neurovisionx}
|
|
163
|
+
}
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## License
|
|
167
|
+
|
|
168
|
+
MIT License — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NeuroVisionX
|
|
3
|
+
===========
|
|
4
|
+
A unified deep-learning library for image classification AND object detection,
|
|
5
|
+
built around a single shared backbone with a novel attention module called
|
|
6
|
+
AFPA (Adaptive Feature Pyramid Attention).
|
|
7
|
+
|
|
8
|
+
Why it's different from "just another CNN wrapper":
|
|
9
|
+
- One backbone, two heads (classification + detection) trained jointly or separately.
|
|
10
|
+
- AFPA blocks fuse multi-scale features with learned channel+spatial gating,
|
|
11
|
+
cheaper than full transformer attention but more expressive than plain FPN.
|
|
12
|
+
- Anchor-free detection head (center + size regression) -> simpler, fewer
|
|
13
|
+
hyperparameters than anchor-based YOLO/SSD style heads.
|
|
14
|
+
- Single training loop (Trainer) handles both tasks via a configurable loss mix.
|
|
15
|
+
|
|
16
|
+
Quick start
|
|
17
|
+
-----------
|
|
18
|
+
import neurovisionx as nv
|
|
19
|
+
|
|
20
|
+
model = nv.NeuroVisionXNet(num_classes=10, mode="both")
|
|
21
|
+
trainer = nv.Trainer(model, train_loader, val_loader)
|
|
22
|
+
trainer.fit(epochs=20)
|
|
23
|
+
|
|
24
|
+
detections = model.detect(image_tensor)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from .layers import AFPABlock, ConvBNAct
|
|
28
|
+
from .backbone import NeuroVisionXBackbone
|
|
29
|
+
from .detection_head import DetectionHead
|
|
30
|
+
from .model import NeuroVisionXNet
|
|
31
|
+
from .dataset import ImageFolderDataset, DetectionDataset
|
|
32
|
+
from .trainer import Trainer
|
|
33
|
+
from .utils import nms, iou, visualize_detections
|
|
34
|
+
|
|
35
|
+
__version__ = "0.1.0"
|
|
36
|
+
__all__ = [
|
|
37
|
+
"AFPABlock", "ConvBNAct",
|
|
38
|
+
"NeuroVisionXBackbone", "DetectionHead", "NeuroVisionXNet",
|
|
39
|
+
"ImageFolderDataset", "DetectionDataset",
|
|
40
|
+
"Trainer",
|
|
41
|
+
"nms", "iou", "visualize_detections",
|
|
42
|
+
]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NeuroVisionBackbone: a multi-stage CNN backbone with AFPA attention inserted
|
|
3
|
+
after every downsampling stage. Returns a dict of multi-scale feature maps
|
|
4
|
+
(C3, C4, C5) so the same backbone can feed both a classification head and a
|
|
5
|
+
detection head (feature pyramid style).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import torch
|
|
9
|
+
import torch.nn as nn
|
|
10
|
+
from .layers import ConvBNAct, AFPABlock
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Stage(nn.Module):
    """One backbone stage: ConvBNAct layers followed by a single AFPABlock.

    Args:
        in_ch: channel count of the incoming feature map.
        out_ch: channel count produced by every layer of the stage.
        num_blocks: total number of ConvBNAct layers; the first one carries
            ``stride``, the remaining ``num_blocks - 1`` use stride 1.
        stride: stride of the first convolution.
    """

    def __init__(self, in_ch, out_ch, num_blocks=2, stride=2):
        super().__init__()
        entry = ConvBNAct(in_ch, out_ch, stride=stride)
        extras = [ConvBNAct(out_ch, out_ch, stride=1) for _ in range(num_blocks - 1)]
        # Attention is appended once, after all convolutions of the stage.
        self.block = nn.Sequential(entry, *extras, AFPABlock(out_ch))

    def forward(self, x):
        """Run ``x`` through the stage's sequential block."""
        features = self.block(x)
        return features
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class NeuroVisionBackbone(nn.Module):
    """Stem convolution followed by four stride-2 stages.

    Args:
        in_channels: channel count of the input image tensor.
        width: channel widths; ``width[0]`` is the stem output and
            ``width[1]``..``width[4]`` are the outputs of stages 1-4.
        depth: ``depth[k]`` is ``num_blocks`` for stage ``k`` (k = 1..4).
            ``depth[0]`` is accepted but never read.

    Attributes:
        out_channels: dict mapping "C3"/"C4"/"C5" to the channel counts of
            the feature maps returned by ``forward``.
    """

    def __init__(self, in_channels=3, width=(32, 64, 128, 256, 512), depth=(1, 2, 2, 3, 3)):
        super().__init__()
        self.stem = ConvBNAct(in_channels, width[0], kernel_size=3, stride=1)
        # stage1 feeds stage2 (C3), stage3 (C4) and stage4 (C5); each stage
        # consumes the previous width and emits the next one.
        for idx, attr in enumerate(("stage1", "stage2", "stage3", "stage4"), start=1):
            setattr(self, attr, Stage(width[idx - 1], width[idx], num_blocks=depth[idx], stride=2))
        self.out_channels = dict(zip(("C3", "C4", "C5"), width[2:5]))

    def forward(self, x):
        """Return ``{"C3": ..., "C4": ..., "C5": ...}`` from stages 2, 3 and 4."""
        pyramid = {}
        x = self.stage1(self.stem(x))
        for key, stage in (("C3", self.stage2), ("C4", self.stage3), ("C5", self.stage4)):
            x = stage(x)
            pyramid[key] = x
        return pyramid
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lightweight dataset classes. No heavy dependency beyond torch + PIL, so the
|
|
3
|
+
library stays easy to install and doesn't force a specific data format.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import json
|
|
8
|
+
import torch
|
|
9
|
+
from torch.utils.data import Dataset
|
|
10
|
+
from PIL import Image
|
|
11
|
+
import torchvision.transforms as T
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ImageFolderDataset(Dataset):
    """Classification dataset over a ``root/class_name/image.jpg`` layout.

    Every sub-directory of ``root`` is a class; class indices follow the
    sorted directory names. Files ending in .jpg/.jpeg/.png (case-insensitive)
    inside a class directory become samples.

    Args:
        root: directory containing one sub-directory per class.
        image_size: side length used by the default transform's resize.
        transform: callable applied to the RGB PIL image. When omitted (or
            falsy), a Resize -> ToTensor -> Normalize pipeline is used.

    Attributes:
        classes: sorted list of class directory names.
        class_to_idx: mapping from class name to integer label.
        samples: list of ``(path, label)`` tuples.
    """

    def __init__(self, root, image_size=224, transform=None):
        self.root = root
        self.classes = sorted(
            d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))
        )
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
        self.samples = []
        for c in self.classes:
            class_dir = os.path.join(root, c)
            # os.listdir returns entries in arbitrary order; sorting makes the
            # sample order (and therefore unshuffled iteration) reproducible.
            for fname in sorted(os.listdir(class_dir)):
                if fname.lower().endswith((".jpg", ".jpeg", ".png")):
                    self.samples.append((os.path.join(class_dir, fname), self.class_to_idx[c]))

        self.transform = transform or T.Compose([
            T.Resize((image_size, image_size)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Number of discovered image files."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for sample ``idx``."""
        path, label = self.samples[idx]
        # convert() loads the pixel data into a new image, so the underlying
        # file can be closed immediately instead of waiting for GC.
        with Image.open(path) as handle:
            img = handle.convert("RGB")
        return self.transform(img), label
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class DetectionDataset(Dataset):
    """Detection dataset backed by a COCO-lite JSON annotation file:

        [
            {"image": "img1.jpg", "boxes": [[x1,y1,x2,y2], ...], "labels": [0, 2, ...]},
            ...
        ]

    Box coordinates are given in original-image pixels and are rescaled by
    ``image_size / original_size`` per axis when a sample is fetched.

    Args:
        root: directory that the records' ``"image"`` paths are relative to.
        annotation_file: path of the JSON file described above.
        image_size: side length used for box rescaling and by the default
            transform's resize.
        transform: callable applied to the RGB PIL image. When omitted (or
            falsy), a Resize -> ToTensor -> Normalize pipeline is used.
            NOTE(review): boxes are always scaled to ``image_size``, so a
            custom transform is presumably expected to resize to it as well.
    """

    def __init__(self, root, annotation_file, image_size=512, transform=None):
        self.root = root
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(annotation_file, encoding="utf-8") as f:
            self.records = json.load(f)
        self.image_size = image_size
        self.transform = transform or T.Compose([
            T.Resize((image_size, image_size)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Number of annotation records."""
        return len(self.records)

    @staticmethod
    def _scaled_boxes(raw_boxes, scale_x, scale_y):
        """Convert a list of [x1, y1, x2, y2] boxes to a scaled float tensor of shape (N, 4)."""
        boxes = torch.tensor(raw_boxes, dtype=torch.float32)
        if boxes.numel() == 0:
            # torch.tensor([]) has shape (0,); keep the (N, 4) layout so that
            # images without objects look like every other sample.
            return boxes.reshape(0, 4)
        boxes[:, [0, 2]] *= scale_x
        boxes[:, [1, 3]] *= scale_y
        return boxes

    def __getitem__(self, idx):
        """Return ``(transformed_image, {"boxes": FloatTensor, "labels": LongTensor})``."""
        rec = self.records[idx]
        # convert() loads the pixels into a new image, so the file handle can
        # be released right away.
        with Image.open(os.path.join(self.root, rec["image"])) as handle:
            img = handle.convert("RGB")
        orig_w, orig_h = img.size
        scale_x = self.image_size / orig_w
        scale_y = self.image_size / orig_h

        boxes = self._scaled_boxes(rec["boxes"], scale_x, scale_y)
        labels = torch.tensor(rec["labels"], dtype=torch.long)

        img = self.transform(img)
        return img, {"boxes": boxes, "labels": labels}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def detection_collate_fn(batch):
    """Collate ``(image, target)`` samples for a detection DataLoader.

    Images are stacked into one tensor along a new leading dimension; targets
    are returned as a plain list because each sample may hold a different
    number of boxes.
    """
    image_list = []
    for sample in batch:
        image_list.append(sample[0])
    stacked = torch.stack(image_list)

    target_list = []
    for sample in batch:
        target_list.append(sample[1])
    return stacked, target_list
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Anchor-free detection head, in the spirit of CenterNet/FCOS but simplified.
|
|
3
|
+
For each pyramid level it predicts, per spatial cell:
|
|
4
|
+
- heatmap: probability an object center lies here (per class)
|
|
5
|
+
- size: (w, h) of the object, regressed directly
|
|
6
|
+
- offset: sub-pixel (dx, dy) correction within the cell
|
|
7
|
+
|
|
8
|
+
This avoids hand-tuned anchor boxes entirely, which is one of the biggest
|
|
9
|
+
sources of friction/hyperparameter pain in classic detectors.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import torch
|
|
13
|
+
import torch.nn as nn
|
|
14
|
+
from .layers import ConvBNAct
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DetectionHead(nn.Module):
    """Anchor-free prediction head applied to a single feature map.

    Two shared 3x3 ConvBNAct layers feed three independent 1x1 convolutions.

    Args:
        in_channels: channel count of the incoming feature map.
        num_classes: number of heatmap channels (one per class).
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.num_classes = num_classes
        tower = [ConvBNAct(in_channels, in_channels, kernel_size=3) for _ in range(2)]
        self.shared = nn.Sequential(*tower)
        self.heatmap = nn.Conv2d(in_channels, num_classes, kernel_size=1)
        self.size = nn.Conv2d(in_channels, 2, kernel_size=1)
        self.offset = nn.Conv2d(in_channels, 2, kernel_size=1)

    def forward(self, feat):
        """Return a dict with "heatmap" (sigmoid), "size" (raw) and "offset" (tanh) maps."""
        trunk = self.shared(feat)
        return {
            "heatmap": self.heatmap(trunk).sigmoid(),
            "size": self.size(trunk),
            "offset": self.offset(trunk).tanh(),
        }
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core building blocks for NeuroVisionX.
|
|
3
|
+
|
|
4
|
+
AFPABlock (Adaptive Feature Pyramid Attention) is the signature module of
|
|
5
|
+
this library. It combines:
|
|
6
|
+
1. Channel attention (squeeze-excite style) to weight "what" matters.
|
|
7
|
+
2. Spatial attention (lightweight, single-conv) to weight "where" matters.
|
|
8
|
+
3. A learnable scalar gate that blends the attended features back with the
|
|
9
|
+
original input (residual gating), so the block can fall back to an
|
|
10
|
+
identity mapping early in training and only "turn on" attention once it
|
|
11
|
+
is useful. This stabilizes training on small/medium datasets compared to
|
|
12
|
+
dropping raw attention blocks into a network from scratch.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import torch
|
|
16
|
+
import torch.nn as nn
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ConvBNAct(nn.Module):
    """Bias-free Conv2d, then BatchNorm2d, then an optional SiLU.

    Args:
        in_ch: input channels.
        out_ch: output channels.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: explicit padding; ``None`` selects ``kernel_size // 2``.
        act: when true, apply in-place SiLU; otherwise the activation slot is
            an identity.
    """

    def __init__(self, in_ch, out_ch, kernel_size=3, stride=1, padding=None, act=True):
        super().__init__()
        pad = kernel_size // 2 if padding is None else padding
        self.conv = nn.Conv2d(
            in_ch, out_ch, kernel_size=kernel_size, stride=stride, padding=pad, bias=False
        )
        self.bn = nn.BatchNorm2d(out_ch)
        if act:
            self.act = nn.SiLU(inplace=True)
        else:
            self.act = nn.Identity()

    def forward(self, x):
        """Apply convolution, normalisation and activation in that order."""
        y = self.conv(x)
        y = self.bn(y)
        return self.act(y)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ChannelAttention(nn.Module):
    """Per-channel reweighting from globally average-pooled features.

    Args:
        channels: channel count of the input (and output).
        reduction: divisor for the hidden width of the two-layer MLP; the
            hidden width never drops below 4.
    """

    def __init__(self, channels, reduction=8):
        super().__init__()
        bottleneck = max(channels // reduction, 4)
        self.pool = nn.AdaptiveAvgPool2d(1)
        squeeze = nn.Linear(channels, bottleneck)
        excite = nn.Linear(bottleneck, channels)
        self.fc = nn.Sequential(squeeze, nn.SiLU(inplace=True), excite, nn.Sigmoid())

    def forward(self, x):
        """Scale each channel of the 4-D input ``x`` by a weight in [0, 1]."""
        batch, chans, _h, _w = x.shape
        pooled = self.pool(x).view(batch, chans)
        scale = self.fc(pooled).view(batch, chans, 1, 1)
        return x * scale
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class SpatialAttention(nn.Module):
    """Per-location reweighting from channel-wise mean and max maps.

    Args:
        kernel_size: size of the single bias-free convolution that turns the
            two-channel (mean, max) descriptor into a one-channel mask.
    """

    def __init__(self, kernel_size=7):
        super().__init__()
        self.conv = nn.Conv2d(2, 1, kernel_size, padding=kernel_size // 2, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Multiply ``x`` by a sigmoid mask shared across channels."""
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True).values
        descriptor = torch.cat((channel_mean, channel_max), dim=1)
        mask = self.sigmoid(self.conv(descriptor))
        return x * mask
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class AFPABlock(nn.Module):
    """Adaptive Feature Pyramid Attention block.

    The input passes through channel attention, spatial attention and a 3x3
    ConvBNAct refinement; the result is blended with the input as
    ``x + g * (attended - x)`` where ``g = sigmoid(gate)``.

    Args:
        channels: number of input/output channels (kept equal for residual gating).
        reduction: channel reduction ratio for the channel-attention branch.
        gate_init: initial raw value of the learnable gate. The default 0.0
            gives ``g = 0.5`` (an even blend of input and attended features).
            Pass a negative value (e.g. -4.0, ``g`` ~ 0.018) to start close to
            an identity mapping.
    """

    def __init__(self, channels, reduction=8, gate_init=0.0):
        super().__init__()
        self.channel_att = ChannelAttention(channels, reduction)
        self.spatial_att = SpatialAttention()
        self.refine = ConvBNAct(channels, channels, kernel_size=3)
        # Learnable scalar; sigmoid(gate) is the blend weight. Note that a raw
        # value of 0.0 maps to a weight of 0.5, not 0.
        self.gate = nn.Parameter(torch.tensor(float(gate_init)))

    def forward(self, x):
        """Return the gated blend of ``x`` and its attended/refined version."""
        attended = self.channel_att(x)
        attended = self.spatial_att(attended)
        attended = self.refine(attended)
        g = torch.sigmoid(self.gate)
        return x + g * (attended - x)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NeuroVisionNet: the user-facing model class.
|
|
3
|
+
|
|
4
|
+
mode="classify" -> only classification head is active
|
|
5
|
+
mode="detect" -> only detection head is active
|
|
6
|
+
mode="both" -> both heads are computed every forward pass (multi-task)
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import torch
|
|
10
|
+
import torch.nn as nn
|
|
11
|
+
from .backbone import NeuroVisionBackbone
|
|
12
|
+
from .detection_head import DetectionHead
|
|
13
|
+
from .utils import decode_heatmap
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class NeuroVisionNet(nn.Module):
    """Backbone plus an optional classification head and/or detection head.

    Args:
        num_classes: output size of the classifier and number of heatmap
            channels of the detection head.
        mode: "classify", "detect" or "both"; selects which heads are built
            and evaluated.
        in_channels: channel count of the input images.

    Raises:
        ValueError: if ``mode`` is not one of the three supported values.
    """

    def __init__(self, num_classes, mode="both", in_channels=3):
        super().__init__()
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if mode not in ("classify", "detect", "both"):
            raise ValueError(
                f"mode must be 'classify', 'detect' or 'both', got {mode!r}"
            )
        self.mode = mode
        self.num_classes = num_classes
        self.backbone = NeuroVisionBackbone(in_channels=in_channels)

        c5 = self.backbone.out_channels["C5"]
        c4 = self.backbone.out_channels["C4"]

        if mode in ("classify", "both"):
            self.classifier = nn.Sequential(
                nn.AdaptiveAvgPool2d(1),
                nn.Flatten(),
                nn.Linear(c5, num_classes),
            )
        if mode in ("detect", "both"):
            self.det_head = DetectionHead(c4, num_classes)
            # project C5 down and upsample to fuse with C4 for slightly richer features
            self.fuse = nn.Conv2d(c5, c4, kernel_size=1)

    def forward(self, x):
        """Return a dict with "class_logits" and/or "detections", depending on ``mode``."""
        feats = self.backbone(x)
        out = {}
        if self.mode in ("classify", "both"):
            out["class_logits"] = self.classifier(feats["C5"])
        if self.mode in ("detect", "both"):
            # Bring the projected C5 map to C4's spatial size before adding.
            up = nn.functional.interpolate(
                self.fuse(feats["C5"]), size=feats["C4"].shape[-2:], mode="nearest"
            )
            fused = feats["C4"] + up
            out["detections"] = self.det_head(fused)
        return out

    @torch.no_grad()
    def detect(self, image_tensor, score_thresh=0.3, max_dets=100):
        """Convenience inference method. image_tensor: (1, C, H, W) normalized.

        Runs the forward pass in eval mode and hands the detection maps, the
        input's spatial size, ``score_thresh`` and ``max_dets`` to
        ``decode_heatmap``, whose result is returned unchanged. The module's
        previous train/eval state is restored afterwards, so calling this in
        the middle of training does not silently leave the model in eval mode.

        Raises:
            RuntimeError: if the model was built without a detection head.
        """
        was_training = self.training
        self.eval()
        try:
            out = self.forward(image_tensor)
        finally:
            self.train(was_training)
        if "detections" not in out:
            raise RuntimeError("Model was not built with a detection head (mode='classify').")
        return decode_heatmap(out["detections"], image_tensor.shape[-2:], score_thresh, max_dets)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Trainer: a single class that handles training for classify / detect / both
|
|
3
|
+
modes, including a focal-loss-style heatmap loss for detection, mixed with
|
|
4
|
+
classification cross-entropy when mode="both" is used jointly with a labeled
|
|
5
|
+
classification dataset (advanced use). For most users, train classification
|
|
6
|
+
and detection separately with two Trainer instances on two datasets.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import torch
|
|
10
|
+
import torch.nn as nn
|
|
11
|
+
import torch.nn.functional as F
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def focal_heatmap_loss(pred, target, alpha=2, beta=4):
    """Penalty-reduced focal loss between a predicted and a target heatmap.

    Elements where ``target == 1`` are positives; elements where
    ``target < 1`` are negatives, down-weighted by ``(1 - target) ** beta``.
    ``pred`` is passed straight to ``log`` (after clamping at 1e-6), so it is
    presumably expected to hold probabilities in [0, 1] - confirm against
    the detection head.

    Returns the summed loss divided by the number of positives (at least 1).
    """
    eps = 1e-6
    is_pos = (target == 1).float()
    is_neg = (target < 1).float()

    # Clamp before the log so a saturated prediction cannot produce -inf.
    log_p = torch.log(pred.clamp(min=eps))
    log_not_p = torch.log((1 - pred).clamp(min=eps))

    pos_term = -((1 - pred) ** alpha) * log_p * is_pos
    neg_term = -((1 - target) ** beta) * (pred ** alpha) * log_not_p * is_neg

    # Normalise by the positive count; clamp so an all-negative batch divides by 1.
    normalizer = is_pos.sum().clamp(min=1)
    return (pos_term.sum() + neg_term.sum()) / normalizer
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def build_heatmap_target(targets, heatmap_shape, image_size, num_classes, device):
    """Build a one-hot centre-point heatmap target from box annotations.

    This is a simplified target: each box contributes a single 1.0 at the
    heatmap cell containing its centre, in the channel of its label. No
    Gaussian is splatted around the centre.

    Args:
        targets: iterable of dicts with ``"boxes"`` (rows unpacked as
            ``x1, y1, x2, y2`` and divided by the stride, so presumably
            input-image pixel coordinates) and ``"labels"`` (class indices).
        heatmap_shape: ``(batch, _, h, w)`` of the predicted heatmap; only
            batch, h and w are used.
        image_size: ``(height, width)`` of the network input.
        num_classes: number of heatmap channels to allocate.
        device: device for the returned tensor.

    Returns:
        Float tensor of shape ``(batch, num_classes, h, w)``.
    """
    b, _, h, w = heatmap_shape
    target = torch.zeros(b, num_classes, h, w, device=device)
    stride_y = image_size[0] / h
    stride_x = image_size[1] / w
    for i, t in enumerate(targets):
        for box, label in zip(t["boxes"], t["labels"]):
            x1, y1, x2, y2 = box.tolist()
            cx = (x1 + x2) / 2 / stride_x
            cy = (y1 + y2) / 2 / stride_y
            # Bounds-check the fractional centre BEFORE truncating: int()
            # rounds toward zero, so a centre in (-1, 0) would otherwise be
            # accepted as cell 0 even though it lies outside the image.
            if not (0 <= cx < w and 0 <= cy < h):
                continue
            target[i, int(label), int(cy), int(cx)] = 1.0
    return target
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class Trainer:
    """Minimal training loop for a model that returns a dict of head outputs.

    One instance trains one task on one loader: ``task="classify"`` uses
    cross-entropy on ``out["class_logits"]``; ``task="detect"`` uses
    ``focal_heatmap_loss`` on ``out["detections"]["heatmap"]``.
    """

    def __init__(self, model, train_loader, val_loader=None, lr=1e-3,
                 device=None, task="classify"):
        """
        Args:
            model: module to train; moved to ``device`` here.
            train_loader: iterable of batches (must support ``len``).
            val_loader: optional iterable evaluated after every epoch.
            lr: learning rate for the AdamW optimizer.
            device: target device; defaults to CUDA when available, else CPU.
            task: "classify" or "detect" - which loss to compute on this loader.
        """
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.task = task
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)

    def fit(self, epochs=10, log_every=50):
        """Train for ``epochs`` epochs, printing progress to stdout.

        Args:
            epochs: number of passes over ``train_loader``.
            log_every: print the step loss every this many steps. ``0``,
                a negative value or ``None`` disables per-step logging.
        """
        step_logging = bool(log_every) and log_every > 0
        for epoch in range(1, epochs + 1):
            self.model.train()
            running_loss = 0.0
            for step, batch in enumerate(self.train_loader):
                loss = self._step(batch)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                # Read the scalar once; each .item() forces a device sync.
                loss_value = loss.item()
                running_loss += loss_value
                if step_logging and step % log_every == 0:
                    print(f"[epoch {epoch}] step {step} loss {loss_value:.4f}")

            avg = running_loss / max(1, len(self.train_loader))
            msg = f"Epoch {epoch}/{epochs} - avg train loss: {avg:.4f}"
            if self.val_loader is not None:
                val_loss = self.evaluate()
                msg += f" - val loss: {val_loss:.4f}"
            print(msg)

    def _step(self, batch):
        """Compute the loss tensor for one batch according to ``self.task``.

        Raises:
            ValueError: if ``self.task`` is neither "classify" nor "detect".
        """
        if self.task == "classify":
            images, labels = batch
            images, labels = images.to(self.device), labels.to(self.device)
            out = self.model(images)
            return F.cross_entropy(out["class_logits"], labels)

        elif self.task == "detect":
            images, targets = batch
            images = images.to(self.device)
            # Move tensors only; any non-tensor metadata in a target dict
            # is passed through unchanged instead of crashing on .to().
            targets = [
                {k: (v.to(self.device) if torch.is_tensor(v) else v) for k, v in t.items()}
                for t in targets
            ]
            out = self.model(images)["detections"]
            heatmap_target = build_heatmap_target(
                targets, out["heatmap"].shape, images.shape[-2:],
                self.model.num_classes, self.device,
            )
            return focal_heatmap_loss(out["heatmap"], heatmap_target)

        raise ValueError(f"Unknown task: {self.task}")

    @torch.no_grad()
    def evaluate(self):
        """Return the mean per-batch loss over ``val_loader``.

        Puts the model in eval mode; ``fit`` switches it back to train mode
        at the start of the next epoch.
        """
        self.model.eval()
        total = 0.0
        for batch in self.val_loader:
            total += self._step(batch).item()
        return total / max(1, len(self.val_loader))
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Shared utility functions: geometry, decoding, visualization."""
|
|
2
|
+
|
|
3
|
+
import torch
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def iou(box1, box2):
    """Intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Returns a scalar; 0.0 when the union has no positive area.
    """
    def _area(b):
        return (b[2] - b[0]) * (b[3] - b[1])

    # Corners of the overlap rectangle (may be inverted when boxes are disjoint).
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    overlap = max(0, right - left) * max(0, bottom - top)
    total = _area(box1) + _area(box2) - overlap
    if total > 0:
        return overlap / total
    return 0.0
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def nms(boxes, scores, iou_thresh=0.5):
    """Greedy non-maximum suppression.

    Args:
        boxes: list/tensor of ``[x1, y1, x2, y2]`` rows.
        scores: list/tensor of per-box scores, same length as ``boxes``.
        iou_thresh: a remaining box is dropped when its IoU with the box
            just kept is strictly greater than this value.

    Returns:
        List of kept indices into ``boxes``, in descending score order.
    """
    if len(boxes) == 0:
        return []
    # IoU is evaluated on the CPU in float64 from float32-rounded coordinates.
    boxes = torch.as_tensor(boxes, dtype=torch.float32).detach().cpu().to(torch.float64)
    scores = torch.as_tensor(scores, dtype=torch.float32).detach().cpu()
    if boxes.dim() == 1:
        # A single flat [x1, y1, x2, y2] box.
        boxes = boxes.unsqueeze(0)
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    order = scores.argsort(descending=True)
    keep = []
    while order.numel() > 0:
        i = order[0].item()
        keep.append(i)
        if order.numel() == 1:
            break
        rest = order[1:]
        top, others = boxes[i], boxes[rest]
        # IoU of the kept box against every remaining candidate at once,
        # instead of one Python-level iou() call per pair.
        inter_w = (torch.minimum(top[2], others[:, 2]) - torch.maximum(top[0], others[:, 0])).clamp(min=0)
        inter_h = (torch.minimum(top[3], others[:, 3]) - torch.maximum(top[1], others[:, 1])).clamp(min=0)
        inter = inter_w * inter_h
        union = areas[i] + areas[rest] - inter
        # Degenerate pairs (no positive union) count as IoU 0.
        ious = torch.where(union > 0, inter / union, torch.zeros_like(union))
        # Compare in float32 so threshold decisions are made at that precision.
        order = rest[ious.to(torch.float32) <= iou_thresh]
    return keep
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def decode_heatmap(det_out, image_size, score_thresh=0.3, max_dets=100):
    """Turn raw detection-head maps into ``{"box", "label", "score"}`` dicts.

    Only batch element 0 of ``det_out["heatmap"]``, ``det_out["size"]`` and
    ``det_out["offset"]`` is decoded. The ``max_dets`` highest-scoring cells
    are taken, those scoring below ``score_thresh`` are dropped, each
    survivor becomes an ``[x1, y1, x2, y2]`` box centred on its
    offset-corrected cell, and the boxes are passed through ``nms``.
    """
    heatmap, size, offset = (det_out[key][0] for key in ("heatmap", "size", "offset"))

    _, h, w = heatmap.shape
    stride_y, stride_x = image_size[0] / h, image_size[1] / w

    # Best class per cell, then the top-scoring cells overall.
    best_score, best_class = heatmap.max(dim=0)
    flat = best_score.flatten()
    cand_scores, cand_idx = flat.topk(min(max_dets, flat.numel()))

    detections = []
    for score, idx in zip(cand_scores.tolist(), cand_idx.tolist()):
        if not (score < score_thresh):
            # Flat index -> (row, column) on the heatmap grid.
            yy, xx = idx // w, idx % w
            cls = best_class[yy, xx].item()
            dx = offset[0, yy, xx].item()
            dy = offset[1, yy, xx].item()
            bw = size[0, yy, xx].item()
            bh = size[1, yy, xx].item()
            # Cell centre plus predicted offset, scaled to image_size units.
            cx = (xx + 0.5 + dx) * stride_x
            cy = (yy + 0.5 + dy) * stride_y
            detections.append({
                "box": [cx - bw / 2, cy - bh / 2, cx + bw / 2, cy + bh / 2],
                "label": cls,
                "score": score,
            })

    if not detections:
        return detections
    kept = nms([d["box"] for d in detections], [d["score"] for d in detections])
    return [detections[k] for k in kept]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def visualize_detections(image, detections, class_names=None, save_path=None):
    """Return a copy of a PIL image with each detection drawn in red.

    Every detection is a dict with ``"box"`` (``x1, y1, x2, y2``),
    ``"label"`` and ``"score"``. The caption is ``class_names[label]`` when
    ``class_names`` is given, otherwise the label itself. The input image is
    not modified; the annotated copy is also written to ``save_path`` when
    one is supplied.
    """
    from PIL import ImageDraw

    canvas = image.copy()
    pen = ImageDraw.Draw(canvas)
    for det in detections:
        left, top, right, bottom = det["box"]
        label = det["label"]
        if class_names:
            caption = class_names[label]
        else:
            caption = str(label)
        pen.rectangle([left, top, right, bottom], outline="red", width=2)
        # Caption sits 10 px above the box, clamped so it stays on the image.
        pen.text((left, max(0, top - 10)), f"{caption} {det['score']:.2f}", fill="red")

    if save_path:
        canvas.save(save_path)
    return canvas
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: neurovisionx
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Unified CNN library for image classification and anchor-free object detection with Adaptive Feature Pyramid Attention (AFPA).
|
|
5
|
+
Author-email: Srikanth Sridhar <srisrikanthtvs@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/sricodings/neurovisionx
|
|
8
|
+
Project-URL: Documentation, https://github.com/sricodings/neurovisionx#readme
|
|
9
|
+
Project-URL: Issues, https://github.com/sricodings/neurovisionx/issues
|
|
10
|
+
Keywords: deep learning,computer vision,object detection,image classification,pytorch,cnn,attention
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: torch>=1.13
|
|
26
|
+
Requires-Dist: torchvision>=0.14
|
|
27
|
+
Requires-Dist: pillow>=9.0
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest; extra == "dev"
|
|
30
|
+
Requires-Dist: black; extra == "dev"
|
|
31
|
+
Requires-Dist: flake8; extra == "dev"
|
|
32
|
+
Requires-Dist: build; extra == "dev"
|
|
33
|
+
Requires-Dist: twine; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# NeuroVisionX
|
|
37
|
+
|
|
38
|
+
**A unified PyTorch library for image classification and anchor-free object detection**,
|
|
39
|
+
built around a single shared backbone and a custom attention module: **AFPA (Adaptive
|
|
40
|
+
Feature Pyramid Attention)**.
|
|
41
|
+
|
|
42
|
+
[PyPI](https://pypi.org/project/neurovisionx/)
|
|
43
|
+
[MIT License](LICENSE)
|
|
44
|
+
[Python 3.8+](https://www.python.org/)
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Why NeuroVisionX
|
|
49
|
+
|
|
50
|
+
Most beginner-friendly libraries make you choose: a simple classifier *or* a heavy,
|
|
51
|
+
hard-to-configure detector (YOLO/SSD-style with anchor boxes and many hyperparameters).
|
|
52
|
+
|
|
53
|
+
NeuroVisionX gives you **both, from one backbone**:
|
|
54
|
+
|
|
55
|
+
- **One backbone, two heads.** Train a classifier, a detector, or both, without
|
|
56
|
+
duplicating feature extraction code.
|
|
57
|
+
- **AFPA attention block.** Lightweight channel+spatial attention with a learnable
|
|
58
|
+
residual gate, so it starts as identity and "switches on" only when useful —
|
|
59
|
+
more stable than dropping raw attention into a network from scratch.
|
|
60
|
+
- **Anchor-free detection head.** Predicts object centers, sizes, and sub-pixel
|
|
61
|
+
offsets directly (CenterNet/FCOS-style) — no anchor box tuning.
|
|
62
|
+
- **One `Trainer` class** for both tasks, with sensible defaults.
|
|
63
|
+
- **Minimal dependencies**: `torch`, `torchvision`, `pillow`. Nothing exotic.
|
|
64
|
+
|
|
65
|
+
## Installation
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install neurovisionx
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
For local development:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
git clone https://github.com/sricodings/neurovisionx.git
|
|
75
|
+
cd neurovisionx
|
|
76
|
+
pip install -e ".[dev]"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Quick start
|
|
80
|
+
|
|
81
|
+
### Classification
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
import neurovisionx as nv
|
|
85
|
+
from torch.utils.data import DataLoader
|
|
86
|
+
|
|
87
|
+
train_ds = nv.ImageFolderDataset("data/train", image_size=224)
|
|
88
|
+
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
|
|
89
|
+
|
|
90
|
+
model = nv.NeuroVisionXNet(num_classes=len(train_ds.classes), mode="classify")
|
|
91
|
+
trainer = nv.Trainer(model, train_loader, task="classify")
|
|
92
|
+
trainer.fit(epochs=20)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Object detection
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
import neurovisionx as nv
|
|
99
|
+
from torch.utils.data import DataLoader
|
|
100
|
+
|
|
101
|
+
train_ds = nv.DetectionDataset("data/images", "data/annotations.json", image_size=512)
|
|
102
|
+
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True,
|
|
103
|
+
collate_fn=nv.dataset.detection_collate_fn)
|
|
104
|
+
|
|
105
|
+
model = nv.NeuroVisionXNet(num_classes=5, mode="detect")
|
|
106
|
+
trainer = nv.Trainer(model, train_loader, task="detect")
|
|
107
|
+
trainer.fit(epochs=30)
|
|
108
|
+
|
|
109
|
+
detections = model.detect(image_tensor, score_thresh=0.4)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Annotation format (`annotations.json`) — simple COCO-lite JSON:
|
|
113
|
+
|
|
114
|
+
```json
|
|
115
|
+
[
|
|
116
|
+
{"image": "img1.jpg", "boxes": [[34, 12, 200, 180]], "labels": [0]},
|
|
117
|
+
{"image": "img2.jpg", "boxes": [[10, 10, 50, 50], [60, 60, 120, 140]], "labels": [2, 4]}
|
|
118
|
+
]
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Architecture overview
|
|
122
|
+
|
|
123
|
+
```
|
|
124
|
+
Input image
|
|
125
|
+
│
|
|
126
|
+
▼
|
|
127
|
+
Stem conv
|
|
128
|
+
│
|
|
129
|
+
▼
|
|
130
|
+
Stage 1 (downsample + AFPA) ─────────────┐
|
|
131
|
+
│ │
|
|
132
|
+
▼ │
|
|
133
|
+
Stage 2 (downsample + AFPA) -> C3 │ multi-scale
|
|
134
|
+
│ │ feature maps
|
|
135
|
+
▼ │
|
|
136
|
+
Stage 3 (downsample + AFPA) -> C4 ──┐ │
|
|
137
|
+
│ │ │
|
|
138
|
+
▼ │ │
|
|
139
|
+
Stage 4 (downsample + AFPA) -> C5 │ │
|
|
140
|
+
│ │ │
|
|
141
|
+
├──> Classifier head (GAP + FC) │ │
|
|
142
|
+
│ ▼ │
|
|
143
|
+
└──> 1x1 conv + upsample ──> fuse with C4
|
|
144
|
+
│
|
|
145
|
+
▼
|
|
146
|
+
Anchor-free detection head
|
|
147
|
+
(heatmap, size, offset maps)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Repository layout
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
neurovisionx/
|
|
154
|
+
├── neurovisionx/
|
|
155
|
+
│ ├── __init__.py # public API
|
|
156
|
+
│ ├── layers.py # ConvBNAct, AFPABlock (the novel attention module)
|
|
157
|
+
│ ├── backbone.py # multi-stage CNN backbone
|
|
158
|
+
│ ├── detection_head.py # anchor-free detection head
|
|
159
|
+
│ ├── model.py # NeuroVisionXNet (combines backbone + heads)
|
|
160
|
+
│ ├── dataset.py # ImageFolderDataset, DetectionDataset
|
|
161
|
+
│ ├── trainer.py # Trainer (training loop, losses)
|
|
162
|
+
│ └── utils.py # IoU, NMS, heatmap decoding, visualization
|
|
163
|
+
├── examples/
|
|
164
|
+
│ ├── train_classifier.py
|
|
165
|
+
│ └── train_detector.py
|
|
166
|
+
├── tests/
|
|
167
|
+
├── docs/
|
|
168
|
+
│ └── PUBLISHING_GUIDE.md
|
|
169
|
+
├── pyproject.toml
|
|
170
|
+
├── LICENSE
|
|
171
|
+
└── README.md
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Roadmap
|
|
175
|
+
|
|
176
|
+
- [ ] Pretrained backbone weights (ImageNet-style pretraining)
|
|
177
|
+
- [ ] ONNX export for deployment
|
|
178
|
+
- [ ] Mixed-precision training support
|
|
179
|
+
- [ ] Data augmentation pipeline (mosaic, mixup, random crop)
|
|
180
|
+
- [ ] Benchmark results vs ResNet/YOLO baselines on COCO subset
|
|
181
|
+
- [ ] Sphinx-generated documentation site
|
|
182
|
+
|
|
183
|
+
## Contributing
|
|
184
|
+
|
|
185
|
+
Pull requests are welcome. Please open an issue first to discuss major changes.
|
|
186
|
+
Run `pytest` and `black .` before submitting.
|
|
187
|
+
|
|
188
|
+
## Citing
|
|
189
|
+
|
|
190
|
+
If you use NeuroVisionX in research, please cite:
|
|
191
|
+
|
|
192
|
+
```bibtex
|
|
193
|
+
@software{neurovisionx2026,
|
|
194
|
+
author = {Srikanth Sridhar},
|
|
195
|
+
title = {NeuroVisionX: Unified Classification and Anchor-Free Detection with Adaptive Feature Pyramid Attention},
|
|
196
|
+
year = {2026},
|
|
197
|
+
url = {https://github.com/sricodings/neurovisionx}
|
|
198
|
+
}
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## License
|
|
202
|
+
|
|
203
|
+
MIT License — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
neurovisionx/__init__.py
|
|
5
|
+
neurovisionx/backbone.py
|
|
6
|
+
neurovisionx/dataset.py
|
|
7
|
+
neurovisionx/detection_head.py
|
|
8
|
+
neurovisionx/layers.py
|
|
9
|
+
neurovisionx/model.py
|
|
10
|
+
neurovisionx/trainer.py
|
|
11
|
+
neurovisionx/utils.py
|
|
12
|
+
neurovisionx.egg-info/PKG-INFO
|
|
13
|
+
neurovisionx.egg-info/SOURCES.txt
|
|
14
|
+
neurovisionx.egg-info/dependency_links.txt
|
|
15
|
+
neurovisionx.egg-info/requires.txt
|
|
16
|
+
neurovisionx.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
neurovisionx
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "neurovisionx"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Unified CNN library for image classification and anchor-free object detection with Adaptive Feature Pyramid Attention (AFPA)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Srikanth Sridhar", email = "srisrikanthtvs@gmail.com" }
|
|
14
|
+
]
|
|
15
|
+
keywords = ["deep learning", "computer vision", "object detection", "image classification", "pytorch", "cnn", "attention"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.8",
|
|
23
|
+
"Programming Language :: Python :: 3.9",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
"Programming Language :: Python :: 3.11",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
27
|
+
"Topic :: Scientific/Engineering :: Image Recognition",
|
|
28
|
+
]
|
|
29
|
+
dependencies = [
|
|
30
|
+
"torch>=1.13",
|
|
31
|
+
"torchvision>=0.14",
|
|
32
|
+
"pillow>=9.0",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/sricodings/neurovisionx"
|
|
37
|
+
Documentation = "https://github.com/sricodings/neurovisionx#readme"
|
|
38
|
+
Issues = "https://github.com/sricodings/neurovisionx/issues"
|
|
39
|
+
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
dev = ["pytest", "black", "flake8", "build", "twine"]
|
|
42
|
+
|
|
43
|
+
[tool.setuptools.packages.find]
|
|
44
|
+
include = ["neurovisionx*"]
|