pyrobovision 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,229 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyrobovision
3
+ Version: 0.5.0
4
+ Summary: Advanced autonomous driving perception and vision-language foundation models for robotics
5
+ Author-email: Georgi Mammen Mullassery <mullassery@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Mullassery/PyRoboVision
8
+ Project-URL: Documentation, https://github.com/Mullassery/PyRoboVision/blob/main/README.md
9
+ Project-URL: Repository, https://github.com/Mullassery/PyRoboVision
10
+ Project-URL: Bug Tracker, https://github.com/Mullassery/PyRoboVision/issues
11
+ Keywords: robotics,autonomous-driving,perception,computer-vision,foundation-models,sam3,clip,grounding-dino
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ Requires-Dist: pyroboframes>=1.0.0
27
+ Requires-Dist: numpy>=1.24.0
28
+ Requires-Dist: torch>=2.0.0
29
+ Requires-Dist: torchvision>=0.15.0
30
+ Requires-Dist: transformers>=4.30.0
31
+ Requires-Dist: scipy>=1.10.0
32
+ Requires-Dist: open3d>=0.17.0
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
35
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
36
+ Requires-Dist: black>=23.0.0; extra == "dev"
37
+ Requires-Dist: isort>=5.12.0; extra == "dev"
38
+ Requires-Dist: mypy>=1.4.0; extra == "dev"
39
+ Requires-Dist: ruff>=0.0.285; extra == "dev"
40
+ Provides-Extra: cuda
41
+ Requires-Dist: cupy>=12.0.0; extra == "cuda"
42
+ Requires-Dist: torch[cuda11x]>=2.0.0; extra == "cuda"
43
+ Provides-Extra: mlx
44
+ Requires-Dist: mlx>=0.0.13; extra == "mlx"
45
+
46
+ # PyRoboVision
47
+
48
+ Advanced autonomous driving perception and vision-language foundation models for robotics. Built on top of [PyRoboFrames](https://github.com/Mullassery/PyRoboFrames).
49
+
50
+ **Focus:** Advanced perception pipelines and multimodal understanding that consume data loaded by PyRoboFrames.
51
+
52
+ ---
53
+
54
+ ## What's Inside
55
+
56
+ ### Autonomous Driving (v0.5)
57
+ - **Cylindrical panoramic stitching** — 360° multi-camera fusion (Waymo, nuScenes)
58
+ - **Advanced blending** — Laplacian pyramid + graph-cut seams
59
+ - **Bird's-eye-view (BEV)** — 3D projection for autonomous perception
60
+ - **GPU acceleration** — CuPy (NVIDIA), MLX (Apple Silicon), NumPy (CPU)
61
+ - **Sensor fusion** — Lidar/Radar + occupancy grid mapping
62
+ - **Dataset loaders** — Waymo TFRecord, nuScenes JSON, KITTI stereo
63
+
64
+ ### Foundation Models (Phase 7)
65
+ - **SAM3 segmentation** — Instance segmentation + temporal tracking
66
+ - **CLIP embeddings** — Scene understanding, text-image similarity
67
+ - **Grounding DINO** — Open-vocabulary object detection
68
+ - **Multi-modal fusion** — Unified detection + segmentation + classification
69
+
70
+ ---
71
+
72
+ ## Installation
73
+
74
+ ```bash
75
+ # Requires PyRoboFrames v1.0+
76
+ pip install pyroboframes pyrobovision
77
+
78
+ # From source
79
+ git clone https://github.com/Mullassery/PyRoboVision.git
80
+ cd PyRoboVision
81
+ pip install -e .
82
+ ```
83
+
84
+ ---
85
+
86
+ ## Quick Start
87
+
88
+ ### Autonomous Driving: 360° Panoramic Perception
89
+
90
+ ```python
91
+ from pyrobovision.automotive import (
92
+ CylindricalStitcher,
93
+ get_waymo_layout,
94
+ )
95
+
96
+ # Stitch 5 cameras into 360° panorama
97
+ layout = get_waymo_layout()
98
+ stitcher = CylindricalStitcher(layout, blend_method="laplacian")
99
+
100
+ frames = {
101
+ "FRONT": ...,
102
+ "FRONT_LEFT": ...,
103
+ # ... other cameras
104
+ }
105
+
106
+ panorama = stitcher.stitch(frames) # [1, H, W, 3] seamless 360°
107
+ ```
108
+
109
+ ### Foundation Models: Multi-Modal Scene Understanding
110
+
111
+ ```python
112
+ from pyrobovision.foundation_models import MultiModalFusion
113
+
114
+ fusion = MultiModalFusion(
115
+ detection_prompt="car . pedestrian . cyclist",
116
+ device="mlx", # or "cuda"
117
+ )
118
+
119
+ scene = fusion.understand(frame)
120
+ for obj in scene.objects:
121
+ print(f"{obj.object_class}: {obj.semantic_label}")
122
+ ```
123
+
124
+ ---
125
+
126
+ ## Architecture
127
+
128
+ ### Dependency Graph
129
+
130
+ ```
131
+ PyRoboVision/
132
+ ├── automotive/ # v0.5 AV perception
133
+ │ ├── stitching.py
134
+ │ ├── blending.py
135
+ │ ├── bev.py
136
+ │ ├── perception_3d.py
137
+ │ ├── tfrecord_utils.py
138
+ │ ├── nuscenes_utils.py
139
+ │ └── datasets.py
140
+
141
+ └── foundation_models/ # Phase 7
142
+ ├── sam3_segmentation.py
143
+ ├── clip_embeddings.py
144
+ ├── grounding_dino.py
145
+ └── multimodal_fusion.py
146
+
147
+ ↓ Depends on PyRoboFrames v1.0+ (dataloader)
148
+ PyRoboFrames/
149
+ ├── RoboFrameDataset # Load LeRobot
150
+ ├── ProprioceptiveLoader # Load state/action
151
+ ├── DataLoader # Device selection
152
+ └── [video decode, sensor fusion, etc.]
153
+ ```
154
+
155
+ **Key design:** PyRoboVision is a consumer library, not a foundation. It uses PyRoboFrames to load data, then applies perception algorithms.
156
+
157
+ ---
158
+
159
+ ## Features
160
+
161
+ | Phase | Feature | Status | Tests |
162
+ |-------|---------|--------|-------|
163
+ | **1** | Cylindrical panoramic projection | ✅ | 10 |
164
+ | **2** | Laplacian pyramid blending | ✅ | 5 |
165
+ | **3** | Bird's-eye-view (BEV) projection | ✅ | 5 |
166
+ | **4a** | GPU acceleration (CuPy/MLX/NumPy) | ✅ | 6 |
167
+ | **4b** | Optical flow seam tracking | ✅ | 10 |
168
+ | **5** | Waymo/nuScenes/KITTI loaders | ✅ | 9 |
169
+ | **6** | Lidar/Radar fusion + Occupancy grids | ✅ | 18 |
170
+ | **7a** | SAM3 temporal segmentation | ✅ | 18 |
171
+ | **7b** | CLIP scene embeddings | ✅ | 25 |
172
+ | **7c** | Grounding DINO detection | ✅ | 26 |
173
+ | **7d** | Multi-modal fusion | ✅ | 17 |
174
+
175
+ **Total: 149 tests, all passing**
176
+
177
+ ---
178
+
179
+ ## Use Cases
180
+
181
+ ### Autonomous Driving
182
+ - Waymo perception pipeline (panoramic stitching + 3D fusion)
183
+ - nuScenes multi-camera understanding
184
+ - Real-time BEV mapping
185
+
186
+ ### Mobile Manipulation
187
+ - Egocentric robot perception (360° view from mobile base)
188
+ - Scene understanding for pick-and-place
189
+
190
+ ### Robot Dog Navigation
191
+ - Panoramic localization (where am I in the scene?)
192
+ - Terrain classification from multi-camera fusion
193
+
194
+ ---
195
+
196
+ ## Related Projects
197
+
198
+ - **[PyRoboFrames](https://github.com/Mullassery/PyRoboFrames)** — Fast ML dataloader for robot learning (core dependency)
199
+ - **[LeRobot](https://github.com/huggingface/lerobot)** — HuggingFace robotics datasets
200
+ - **[Segment Anything 3 (SAM3)](https://github.com/facebookresearch/segment-anything-3)** — Instance segmentation
201
+ - **[CLIP](https://github.com/openai/CLIP)** — Vision-language models
202
+ - **[Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)** — Open-vocabulary detection
203
+
204
+ ---
205
+
206
+ ## License
207
+
208
+ MIT (same as PyRoboFrames)
209
+
210
+ ---
211
+
212
+ ## Contributing
213
+
214
+ Contributions welcome. Please open issues and PRs on GitHub.
215
+
216
+ For architectural decisions, see [ARCHITECTURE.md](./ARCHITECTURE.md).
217
+
218
+ ---
219
+
220
+ ## Citation
221
+
222
+ ```bibtex
223
+ @software{mullassery2025pyrobovision,
224
+ title={PyRoboVision: Advanced perception and vision-language models for robotics},
225
+ author={Mullassery, Georgi},
226
+ url={https://github.com/Mullassery/PyRoboVision},
227
+ year={2025}
228
+ }
229
+ ```
@@ -0,0 +1,184 @@
1
+ # PyRoboVision
2
+
3
+ Advanced autonomous driving perception and vision-language foundation models for robotics. Built on top of [PyRoboFrames](https://github.com/Mullassery/PyRoboFrames).
4
+
5
+ **Focus:** Advanced perception pipelines and multimodal understanding that consume data loaded by PyRoboFrames.
6
+
7
+ ---
8
+
9
+ ## What's Inside
10
+
11
+ ### Autonomous Driving (v0.5)
12
+ - **Cylindrical panoramic stitching** — 360° multi-camera fusion (Waymo, nuScenes)
13
+ - **Advanced blending** — Laplacian pyramid + graph-cut seams
14
+ - **Bird's-eye-view (BEV)** — 3D projection for autonomous perception
15
+ - **GPU acceleration** — CuPy (NVIDIA), MLX (Apple Silicon), NumPy (CPU)
16
+ - **Sensor fusion** — Lidar/Radar + occupancy grid mapping
17
+ - **Dataset loaders** — Waymo TFRecord, nuScenes JSON, KITTI stereo
18
+
19
+ ### Foundation Models (Phase 7)
20
+ - **SAM3 segmentation** — Instance segmentation + temporal tracking
21
+ - **CLIP embeddings** — Scene understanding, text-image similarity
22
+ - **Grounding DINO** — Open-vocabulary object detection
23
+ - **Multi-modal fusion** — Unified detection + segmentation + classification
24
+
25
+ ---
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ # Requires PyRoboFrames v1.0+
31
+ pip install pyroboframes pyrobovision
32
+
33
+ # From source
34
+ git clone https://github.com/Mullassery/PyRoboVision.git
35
+ cd PyRoboVision
36
+ pip install -e .
37
+ ```
38
+
39
+ ---
40
+
41
+ ## Quick Start
42
+
43
+ ### Autonomous Driving: 360° Panoramic Perception
44
+
45
+ ```python
46
+ from pyrobovision.automotive import (
47
+ CylindricalStitcher,
48
+ get_waymo_layout,
49
+ )
50
+
51
+ # Stitch 5 cameras into 360° panorama
52
+ layout = get_waymo_layout()
53
+ stitcher = CylindricalStitcher(layout, blend_method="laplacian")
54
+
55
+ frames = {
56
+ "FRONT": ...,
57
+ "FRONT_LEFT": ...,
58
+ # ... other cameras
59
+ }
60
+
61
+ panorama = stitcher.stitch(frames) # [1, H, W, 3] seamless 360°
62
+ ```
63
+
64
+ ### Foundation Models: Multi-Modal Scene Understanding
65
+
66
+ ```python
67
+ from pyrobovision.foundation_models import MultiModalFusion
68
+
69
+ fusion = MultiModalFusion(
70
+ detection_prompt="car . pedestrian . cyclist",
71
+ device="mlx", # or "cuda"
72
+ )
73
+
74
+ scene = fusion.understand(frame)
75
+ for obj in scene.objects:
76
+ print(f"{obj.object_class}: {obj.semantic_label}")
77
+ ```
78
+
79
+ ---
80
+
81
+ ## Architecture
82
+
83
+ ### Dependency Graph
84
+
85
+ ```
86
+ PyRoboVision/
87
+ ├── automotive/ # v0.5 AV perception
88
+ │ ├── stitching.py
89
+ │ ├── blending.py
90
+ │ ├── bev.py
91
+ │ ├── perception_3d.py
92
+ │ ├── tfrecord_utils.py
93
+ │ ├── nuscenes_utils.py
94
+ │ └── datasets.py
95
+
96
+ └── foundation_models/ # Phase 7
97
+ ├── sam3_segmentation.py
98
+ ├── clip_embeddings.py
99
+ ├── grounding_dino.py
100
+ └── multimodal_fusion.py
101
+
102
+ ↓ Depends on PyRoboFrames v1.0+ (dataloader)
103
+ PyRoboFrames/
104
+ ├── RoboFrameDataset # Load LeRobot
105
+ ├── ProprioceptiveLoader # Load state/action
106
+ ├── DataLoader # Device selection
107
+ └── [video decode, sensor fusion, etc.]
108
+ ```
109
+
110
+ **Key design:** PyRoboVision is a consumer library, not a foundation. It uses PyRoboFrames to load data, then applies perception algorithms.
111
+
112
+ ---
113
+
114
+ ## Features
115
+
116
+ | Phase | Feature | Status | Tests |
117
+ |-------|---------|--------|-------|
118
+ | **1** | Cylindrical panoramic projection | ✅ | 10 |
119
+ | **2** | Laplacian pyramid blending | ✅ | 5 |
120
+ | **3** | Bird's-eye-view (BEV) projection | ✅ | 5 |
121
+ | **4a** | GPU acceleration (CuPy/MLX/NumPy) | ✅ | 6 |
122
+ | **4b** | Optical flow seam tracking | ✅ | 10 |
123
+ | **5** | Waymo/nuScenes/KITTI loaders | ✅ | 9 |
124
+ | **6** | Lidar/Radar fusion + Occupancy grids | ✅ | 18 |
125
+ | **7a** | SAM3 temporal segmentation | ✅ | 18 |
126
+ | **7b** | CLIP scene embeddings | ✅ | 25 |
127
+ | **7c** | Grounding DINO detection | ✅ | 26 |
128
+ | **7d** | Multi-modal fusion | ✅ | 17 |
129
+
130
+ **Total: 149 tests, all passing**
131
+
132
+ ---
133
+
134
+ ## Use Cases
135
+
136
+ ### Autonomous Driving
137
+ - Waymo perception pipeline (panoramic stitching + 3D fusion)
138
+ - nuScenes multi-camera understanding
139
+ - Real-time BEV mapping
140
+
141
+ ### Mobile Manipulation
142
+ - Egocentric robot perception (360° view from mobile base)
143
+ - Scene understanding for pick-and-place
144
+
145
+ ### Robot Dog Navigation
146
+ - Panoramic localization (where am I in the scene?)
147
+ - Terrain classification from multi-camera fusion
148
+
149
+ ---
150
+
151
+ ## Related Projects
152
+
153
+ - **[PyRoboFrames](https://github.com/Mullassery/PyRoboFrames)** — Fast ML dataloader for robot learning (core dependency)
154
+ - **[LeRobot](https://github.com/huggingface/lerobot)** — HuggingFace robotics datasets
155
+ - **[Segment Anything 3 (SAM3)](https://github.com/facebookresearch/segment-anything-3)** — Instance segmentation
156
+ - **[CLIP](https://github.com/openai/CLIP)** — Vision-language models
157
+ - **[Grounding DINO](https://github.com/IDEA-Research/GroundingDINO)** — Open-vocabulary detection
158
+
159
+ ---
160
+
161
+ ## License
162
+
163
+ MIT (same as PyRoboFrames)
164
+
165
+ ---
166
+
167
+ ## Contributing
168
+
169
+ Contributions welcome. Please open issues and PRs on GitHub.
170
+
171
+ For architectural decisions, see [ARCHITECTURE.md](./ARCHITECTURE.md).
172
+
173
+ ---
174
+
175
+ ## Citation
176
+
177
+ ```bibtex
178
+ @software{mullassery2025pyrobovision,
179
+ title={PyRoboVision: Advanced perception and vision-language models for robotics},
180
+ author={Mullassery, Georgi},
181
+ url={https://github.com/Mullassery/PyRoboVision},
182
+ year={2025}
183
+ }
184
+ ```
@@ -0,0 +1,98 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pyrobovision"
7
+ version = "0.5.0"
8
+ description = "Advanced autonomous driving perception and vision-language foundation models for robotics"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ authors = [{ name = "Georgi Mammen Mullassery", email = "mullassery@gmail.com" }]
12
+ keywords = [
13
+ "robotics",
14
+ "autonomous-driving",
15
+ "perception",
16
+ "computer-vision",
17
+ "foundation-models",
18
+ "sam3",
19
+ "clip",
20
+ "grounding-dino",
21
+ ]
22
+ classifiers = [
23
+ "Development Status :: 4 - Beta",
24
+ "Intended Audience :: Developers",
25
+ "Intended Audience :: Science/Research",
26
+ "License :: OSI Approved :: MIT License",
27
+ "Operating System :: OS Independent",
28
+ "Programming Language :: Python :: 3",
29
+ "Programming Language :: Python :: 3.10",
30
+ "Programming Language :: Python :: 3.11",
31
+ "Programming Language :: Python :: 3.12",
32
+ "Programming Language :: Python :: 3.13",
33
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
34
+ "Topic :: Scientific/Engineering :: Image Recognition",
35
+ ]
36
+ requires-python = ">=3.10"
37
+
38
+ dependencies = [
39
+ "pyroboframes>=1.0.0",
40
+ "numpy>=1.24.0",
41
+ "torch>=2.0.0",
42
+ "torchvision>=0.15.0",
43
+ "transformers>=4.30.0",
44
+ "scipy>=1.10.0",
45
+ "open3d>=0.17.0",
46
+ ]
47
+
48
+ [project.optional-dependencies]
49
+ dev = [
50
+ "pytest>=7.4.0",
51
+ "pytest-cov>=4.1.0",
52
+ "black>=23.0.0",
53
+ "isort>=5.12.0",
54
+ "mypy>=1.4.0",
55
+ "ruff>=0.0.285",
56
+ ]
57
+
58
+ cuda = [
59
+ "cupy>=12.0.0",
60
+ "torch[cuda11x]>=2.0.0",
61
+ ]
62
+
63
+ mlx = [
64
+ "mlx>=0.0.13",
65
+ ]
66
+
67
+ [project.urls]
68
+ Homepage = "https://github.com/Mullassery/PyRoboVision"
69
+ Documentation = "https://github.com/Mullassery/PyRoboVision/blob/main/README.md"
70
+ Repository = "https://github.com/Mullassery/PyRoboVision"
71
+ "Bug Tracker" = "https://github.com/Mullassery/PyRoboVision/issues"
72
+
73
+ [tool.setuptools]
74
+ packages = ["pyrobovision"]
75
+
76
+ [tool.setuptools.package-data]
77
+ pyrobovision = ["py.typed"]
78
+
79
+ [tool.black]
80
+ line-length = 100
81
+ target-version = ["py310", "py311", "py312", "py313"]
82
+
83
+ [tool.isort]
84
+ profile = "black"
85
+ line_length = 100
86
+ multi_line_mode = 3
87
+ skip_glob = ["*.pyi"]
88
+
89
+ [tool.pytest.ini_options]
90
+ testpaths = ["tests"]
91
+ addopts = "-v --cov=pyrobovision --cov-report=term-missing"
92
+ asyncio_mode = "auto"
93
+
94
+ [tool.mypy]
95
+ python_version = "3.10"
96
+ warn_return_any = true
97
+ warn_unused_configs = true
98
+ disallow_untyped_defs = false
File without changes