vlm-engine 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vlm_engine-0.1.0/PKG-INFO +309 -0
- vlm_engine-0.1.0/README.md +276 -0
- vlm_engine-0.1.0/setup.cfg +4 -0
- vlm_engine-0.1.0/setup.py +32 -0
- vlm_engine-0.1.0/vlm_engine/__init__.py +11 -0
- vlm_engine-0.1.0/vlm_engine/async_utils.py +131 -0
- vlm_engine-0.1.0/vlm_engine/config_models.py +46 -0
- vlm_engine-0.1.0/vlm_engine/dynamic_ai.py +101 -0
- vlm_engine-0.1.0/vlm_engine/engine.py +55 -0
- vlm_engine-0.1.0/vlm_engine/model_wrapper.py +8 -0
- vlm_engine-0.1.0/vlm_engine/models.py +251 -0
- vlm_engine-0.1.0/vlm_engine/pipeline.py +132 -0
- vlm_engine-0.1.0/vlm_engine/postprocessing.py +268 -0
- vlm_engine-0.1.0/vlm_engine/preprocessing.py +183 -0
- vlm_engine-0.1.0/vlm_engine/python_functions.py +122 -0
- vlm_engine-0.1.0/vlm_engine/vlm_client.py +163 -0
- vlm_engine-0.1.0/vlm_engine.egg-info/PKG-INFO +309 -0
- vlm_engine-0.1.0/vlm_engine.egg-info/SOURCES.txt +19 -0
- vlm_engine-0.1.0/vlm_engine.egg-info/dependency_links.txt +1 -0
- vlm_engine-0.1.0/vlm_engine.egg-info/requires.txt +7 -0
- vlm_engine-0.1.0/vlm_engine.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,309 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vlm_engine
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Advanced Vision-Language Model Engine for content tagging
|
|
5
|
+
Home-page: https://github.com/Haven-hvn/haven-vlm-engine-package
|
|
6
|
+
Author: HAVEN Network
|
|
7
|
+
Author-email: officialhavennetwork@gmail.com
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: pydantic
|
|
18
|
+
Requires-Dist: numpy
|
|
19
|
+
Requires-Dist: torch
|
|
20
|
+
Requires-Dist: torchvision
|
|
21
|
+
Requires-Dist: aiohttp
|
|
22
|
+
Requires-Dist: pyyaml
|
|
23
|
+
Requires-Dist: opencv-python
|
|
24
|
+
Dynamic: author
|
|
25
|
+
Dynamic: author-email
|
|
26
|
+
Dynamic: classifier
|
|
27
|
+
Dynamic: description
|
|
28
|
+
Dynamic: description-content-type
|
|
29
|
+
Dynamic: home-page
|
|
30
|
+
Dynamic: requires-dist
|
|
31
|
+
Dynamic: requires-python
|
|
32
|
+
Dynamic: summary
|
|
33
|
+
|
|
34
|
+
# VLM Engine
|
|
35
|
+
|
|
36
|
+
A high-performance Python package for Vision-Language Model (VLM) based content tagging and analysis. This package provides an advanced implementation for automatic content detection and tagging, delivering superior accuracy compared to traditional image classification methods.
|
|
37
|
+
|
|
38
|
+
## Features
|
|
39
|
+
|
|
40
|
+
- **Remote VLM Integration**: Connects to any OpenAI-compatible VLM endpoint (no local model loading required)
|
|
41
|
+
- **Context-Aware Detection**: Leverages Vision-Language Models' understanding of visual relationships for accurate content tagging
|
|
42
|
+
- **Flexible Architecture**: Modular pipeline system with configurable models and processing stages
|
|
43
|
+
- **Asynchronous Processing**: Built on asyncio for efficient video and image processing
|
|
44
|
+
- **Customizable Tag Sets**: Easy configuration of detection categories and confidence thresholds
|
|
45
|
+
- **Production Ready**: Includes retry logic, error handling, and comprehensive logging
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
### From PyPI (when published)
|
|
50
|
+
```bash
|
|
51
|
+
pip install vlm-engine
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### From Source
|
|
55
|
+
```bash
|
|
56
|
+
git clone https://github.com/Haven-hvn/haven-vlm-engine-package.git
|
|
57
|
+
cd haven-vlm-engine-package
|
|
58
|
+
pip install -e .
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Requirements
|
|
62
|
+
- Python 3.8+
|
|
63
|
+
- **Sufficient RAM**: Video preprocessing loads entire videos into memory (not GPU memory)
|
|
64
|
+
- Compatible VLM server endpoint:
|
|
65
|
+
- Remote OpenAI-compatible API (recommended)
|
|
66
|
+
- Local server using [LM Studio](https://lmstudio.ai/)
|
|
67
|
+
- Haven's custom VLM available at [https://havenmodels.orbiter.website/](https://havenmodels.orbiter.website/)
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import asyncio
|
|
73
|
+
from vlm_engine import VLMEngine
|
|
74
|
+
from vlm_engine.config_models import EngineConfig, ModelConfig
|
|
75
|
+
|
|
76
|
+
# Configure the engine
|
|
77
|
+
config = EngineConfig(
|
|
78
|
+
active_ai_models=["vlm_nsfw_model"],
|
|
79
|
+
models={
|
|
80
|
+
"vlm_nsfw_model": ModelConfig(
|
|
81
|
+
type="vlm_model",
|
|
82
|
+
model_id="HuggingFaceTB/SmolVLM-Instruct",
|
|
83
|
+
api_base_url="http://localhost:7045",
|
|
84
|
+
tag_list=["tag1", "tag2", "tag3"] # Your custom tags
|
|
85
|
+
)
|
|
86
|
+
}
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# Initialize and use
|
|
90
|
+
async def main():
|
|
91
|
+
engine = VLMEngine(config)
|
|
92
|
+
await engine.initialize()
|
|
93
|
+
|
|
94
|
+
results = await engine.process_video(
|
|
95
|
+
"path/to/video.mp4",
|
|
96
|
+
frame_interval=2.0,
|
|
97
|
+
threshold=0.5
|
|
98
|
+
)
|
|
99
|
+
print(f"Detected tags: {results}")
|
|
100
|
+
|
|
101
|
+
asyncio.run(main())
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Architecture
|
|
105
|
+
|
|
106
|
+
### Core Components
|
|
107
|
+
|
|
108
|
+
1. **VLMEngine**: Main entry point for the package
|
|
109
|
+
- Manages model initialization and pipeline execution
|
|
110
|
+
- Handles asynchronous processing of videos and images
|
|
111
|
+
|
|
112
|
+
2. **VLMClient**: OpenAI-compatible API client
|
|
113
|
+
- Supports any VLM with chat completions endpoint
|
|
114
|
+
- Includes retry logic with exponential backoff and jitter
|
|
115
|
+
- Handles image encoding and prompt formatting
|
|
116
|
+
|
|
117
|
+
3. **Pipeline System**: Flexible processing pipeline
|
|
118
|
+
- Modular design allows custom processing stages
|
|
119
|
+
- Built-in support for preprocessing, analysis, and postprocessing
|
|
120
|
+
- Configurable through YAML or Python objects
|
|
121
|
+
|
|
122
|
+
4. **Model Management**: Dynamic model loading
|
|
123
|
+
- Supports multiple model types (VLM, preprocessors, postprocessors)
|
|
124
|
+
- Lazy loading for efficient resource usage
|
|
125
|
+
- Thread-safe model access
|
|
126
|
+
|
|
127
|
+
## Configuration
|
|
128
|
+
|
|
129
|
+
### Basic Configuration
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from vlm_engine.config_models import EngineConfig, ModelConfig, PipelineConfig
|
|
133
|
+
|
|
134
|
+
config = EngineConfig(
|
|
135
|
+
active_ai_models=["my_vlm_model"],
|
|
136
|
+
models={
|
|
137
|
+
"my_vlm_model": ModelConfig(
|
|
138
|
+
type="vlm_model",
|
|
139
|
+
model_id="model-name",
|
|
140
|
+
api_base_url="http://localhost:8000",
|
|
141
|
+
tag_list=["action1", "action2", "action3"],
|
|
142
|
+
max_new_tokens=128,
|
|
143
|
+
request_timeout=70,
|
|
144
|
+
vlm_detected_tag_confidence=0.99
|
|
145
|
+
)
|
|
146
|
+
},
|
|
147
|
+
pipelines={
|
|
148
|
+
"video_pipeline": PipelineConfig(
|
|
149
|
+
inputs=["video_path", "frame_interval"],
|
|
150
|
+
output="results",
|
|
151
|
+
models=[{"name": "my_vlm_model", "inputs": ["frame"], "outputs": "tags"}]
|
|
152
|
+
)
|
|
153
|
+
}
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Advanced Configuration
|
|
158
|
+
|
|
159
|
+
The package supports complex configurations including:
|
|
160
|
+
- Multiple models in a pipeline
|
|
161
|
+
- Custom preprocessing and postprocessing stages
|
|
162
|
+
- Category-specific settings (thresholds, durations, etc.)
|
|
163
|
+
- Batch processing configurations
|
|
164
|
+
|
|
165
|
+
See the [examples](examples/) directory for detailed configuration examples.
|
|
166
|
+
|
|
167
|
+
## API Reference
|
|
168
|
+
|
|
169
|
+
### VLMEngine
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
class VLMEngine:
|
|
173
|
+
def __init__(self, config: EngineConfig)
|
|
174
|
+
async def initialize()
|
|
175
|
+
async def process_video(video_path: str, **kwargs) -> Dict[str, Any]
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Processing Parameters
|
|
179
|
+
|
|
180
|
+
- `video_path`: Path to the video file
|
|
181
|
+
- `frame_interval`: Seconds between frame samples (default: 0.5)
|
|
182
|
+
- `threshold`: Confidence threshold for tag detection (default: 0.5)
|
|
183
|
+
- `return_timestamps`: Include timestamp information (default: True)
|
|
184
|
+
- `return_confidence`: Include confidence scores (default: True)
|
|
185
|
+
|
|
186
|
+
## Performance Optimization
|
|
187
|
+
|
|
188
|
+
### Memory Requirements
|
|
189
|
+
- **Important**: Video preprocessing loads the entire video into system RAM (not GPU memory)
|
|
190
|
+
- Ensure sufficient RAM for your video sizes (e.g., a 1GB video may require 4-8GB of available RAM)
|
|
191
|
+
- Consider processing videos in segments for very large files
|
|
192
|
+
|
|
193
|
+
### API Optimization
|
|
194
|
+
- Configure retry settings based on your VLM server's capacity
|
|
195
|
+
- Adjust `max_new_tokens` to balance speed vs accuracy
|
|
196
|
+
- Use appropriate `frame_interval` to reduce processing time and API calls
|
|
197
|
+
|
|
198
|
+
### Processing Speed
|
|
199
|
+
- Increase `frame_interval` to sample fewer frames (faster but less accurate)
|
|
200
|
+
- Use batch processing when your VLM endpoint supports it
|
|
201
|
+
- Consider running multiple VLM instances for parallel processing
|
|
202
|
+
|
|
203
|
+
## Extending the Package
|
|
204
|
+
|
|
205
|
+
### Custom Models
|
|
206
|
+
|
|
207
|
+
Create custom model classes by inheriting from the base Model class:
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
from vlm_engine.models import Model
|
|
211
|
+
|
|
212
|
+
class CustomModel(Model):
|
|
213
|
+
async def process(self, inputs):
|
|
214
|
+
# Your custom processing logic
|
|
215
|
+
return results
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
### Custom Pipelines
|
|
219
|
+
|
|
220
|
+
Define custom pipelines for specific use cases:
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
custom_pipeline = PipelineConfig(
|
|
224
|
+
inputs=["image_path"],
|
|
225
|
+
output="analysis",
|
|
226
|
+
models=[
|
|
227
|
+
{"name": "preprocessor", "inputs": ["image_path"], "outputs": "processed_image"},
|
|
228
|
+
{"name": "analyzer", "inputs": ["processed_image"], "outputs": "analysis"}
|
|
229
|
+
]
|
|
230
|
+
)
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
## Troubleshooting
|
|
234
|
+
|
|
235
|
+
### Common Issues
|
|
236
|
+
|
|
237
|
+
1. **Connection Errors**
|
|
238
|
+
- Ensure your VLM server is running and accessible
|
|
239
|
+
- Check the `api_base_url` configuration
|
|
240
|
+
- Verify firewall settings
|
|
241
|
+
|
|
242
|
+
2. **GPU Memory Errors**
|
|
243
|
+
- Reduce batch size or frame interval
|
|
244
|
+
- Ensure proper CUDA installation
|
|
245
|
+
- Check GPU memory availability
|
|
246
|
+
|
|
247
|
+
3. **Slow Processing**
|
|
248
|
+
- Increase frame interval for faster processing
|
|
249
|
+
- Use GPU acceleration if available
|
|
250
|
+
- Optimize VLM server settings
|
|
251
|
+
|
|
252
|
+
### Logging
|
|
253
|
+
|
|
254
|
+
Enable debug logging for troubleshooting:
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
import logging
|
|
258
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
## Contributing
|
|
262
|
+
|
|
263
|
+
Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
|
|
264
|
+
|
|
265
|
+
### Development Setup
|
|
266
|
+
|
|
267
|
+
```bash
|
|
268
|
+
git clone https://github.com/Haven-hvn/haven-vlm-engine-package.git
|
|
269
|
+
cd haven-vlm-engine-package
|
|
270
|
+
pip install -e ".[dev]"
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
### Running Tests
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
pytest tests/
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
## License
|
|
280
|
+
|
|
281
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
282
|
+
|
|
283
|
+
## Acknowledgments
|
|
284
|
+
|
|
285
|
+
- Built on top of modern Python async patterns
|
|
286
|
+
- Inspired by production ML serving architectures
|
|
287
|
+
- Haven's custom VLM models trained using [SmolVLM-Finetune](https://github.com/Haven-hvn/SmolVLM-Finetune) - Model Download found on [https://havenmodels.orbiter.website/](https://havenmodels.orbiter.website/)
|
|
288
|
+
|
|
289
|
+
- Designed for integration with OpenAI-compatible VLM endpoints
|
|
290
|
+
|
|
291
|
+
## Support
|
|
292
|
+
|
|
293
|
+
For issues and feature requests, please use the [GitHub issue tracker](https://github.com/Haven-hvn/haven-vlm-engine-package/issues).
|
|
294
|
+
|
|
295
|
+
For questions and discussions, join our community:
|
|
296
|
+
- Discord: [Link to Discord](https://discord.gg/57mPMDfQew)
|
|
297
|
+
|
|
298
|
+
---
|
|
299
|
+
|
|
300
|
+
**Note**: This package requires an OpenAI-compatible VLM endpoint. Options include:
|
|
301
|
+
|
|
302
|
+
### Remote Services
|
|
303
|
+
- Any OpenAI-compatible API endpoint
|
|
304
|
+
- Akash deployment - https://github.com/Haven-hvn/haven-inference
|
|
305
|
+
|
|
306
|
+
### Local Setup
|
|
307
|
+
- [LM Studio](https://lmstudio.ai/) - Easy local VLM hosting with OpenAI-compatible API
|
|
308
|
+
|
|
309
|
+
The package **does not** load VLM models directly - it communicates with external VLM services via API.
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# VLM Engine
|
|
2
|
+
|
|
3
|
+
A high-performance Python package for Vision-Language Model (VLM) based content tagging and analysis. This package provides an advanced implementation for automatic content detection and tagging, delivering superior accuracy compared to traditional image classification methods.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Remote VLM Integration**: Connects to any OpenAI-compatible VLM endpoint (no local model loading required)
|
|
8
|
+
- **Context-Aware Detection**: Leverages Vision-Language Models' understanding of visual relationships for accurate content tagging
|
|
9
|
+
- **Flexible Architecture**: Modular pipeline system with configurable models and processing stages
|
|
10
|
+
- **Asynchronous Processing**: Built on asyncio for efficient video and image processing
|
|
11
|
+
- **Customizable Tag Sets**: Easy configuration of detection categories and confidence thresholds
|
|
12
|
+
- **Production Ready**: Includes retry logic, error handling, and comprehensive logging
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
### From PyPI (when published)
|
|
17
|
+
```bash
|
|
18
|
+
pip install vlm-engine
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### From Source
|
|
22
|
+
```bash
|
|
23
|
+
git clone https://github.com/Haven-hvn/haven-vlm-engine-package.git
|
|
24
|
+
cd haven-vlm-engine-package
|
|
25
|
+
pip install -e .
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Requirements
|
|
29
|
+
- Python 3.8+
|
|
30
|
+
- **Sufficient RAM**: Video preprocessing loads entire videos into memory (not GPU memory)
|
|
31
|
+
- Compatible VLM server endpoint:
|
|
32
|
+
- Remote OpenAI-compatible API (recommended)
|
|
33
|
+
- Local server using [LM Studio](https://lmstudio.ai/)
|
|
34
|
+
- Haven's custom VLM available at [https://havenmodels.orbiter.website/](https://havenmodels.orbiter.website/)
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import asyncio
|
|
40
|
+
from vlm_engine import VLMEngine
|
|
41
|
+
from vlm_engine.config_models import EngineConfig, ModelConfig
|
|
42
|
+
|
|
43
|
+
# Configure the engine
|
|
44
|
+
config = EngineConfig(
|
|
45
|
+
active_ai_models=["vlm_nsfw_model"],
|
|
46
|
+
models={
|
|
47
|
+
"vlm_nsfw_model": ModelConfig(
|
|
48
|
+
type="vlm_model",
|
|
49
|
+
model_id="HuggingFaceTB/SmolVLM-Instruct",
|
|
50
|
+
api_base_url="http://localhost:7045",
|
|
51
|
+
tag_list=["tag1", "tag2", "tag3"] # Your custom tags
|
|
52
|
+
)
|
|
53
|
+
}
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Initialize and use
|
|
57
|
+
async def main():
|
|
58
|
+
engine = VLMEngine(config)
|
|
59
|
+
await engine.initialize()
|
|
60
|
+
|
|
61
|
+
results = await engine.process_video(
|
|
62
|
+
"path/to/video.mp4",
|
|
63
|
+
frame_interval=2.0,
|
|
64
|
+
threshold=0.5
|
|
65
|
+
)
|
|
66
|
+
print(f"Detected tags: {results}")
|
|
67
|
+
|
|
68
|
+
asyncio.run(main())
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Architecture
|
|
72
|
+
|
|
73
|
+
### Core Components
|
|
74
|
+
|
|
75
|
+
1. **VLMEngine**: Main entry point for the package
|
|
76
|
+
- Manages model initialization and pipeline execution
|
|
77
|
+
- Handles asynchronous processing of videos and images
|
|
78
|
+
|
|
79
|
+
2. **VLMClient**: OpenAI-compatible API client
|
|
80
|
+
- Supports any VLM with chat completions endpoint
|
|
81
|
+
- Includes retry logic with exponential backoff and jitter
|
|
82
|
+
- Handles image encoding and prompt formatting
|
|
83
|
+
|
|
84
|
+
3. **Pipeline System**: Flexible processing pipeline
|
|
85
|
+
- Modular design allows custom processing stages
|
|
86
|
+
- Built-in support for preprocessing, analysis, and postprocessing
|
|
87
|
+
- Configurable through YAML or Python objects
|
|
88
|
+
|
|
89
|
+
4. **Model Management**: Dynamic model loading
|
|
90
|
+
- Supports multiple model types (VLM, preprocessors, postprocessors)
|
|
91
|
+
- Lazy loading for efficient resource usage
|
|
92
|
+
- Thread-safe model access
|
|
93
|
+
|
|
94
|
+
## Configuration
|
|
95
|
+
|
|
96
|
+
### Basic Configuration
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from vlm_engine.config_models import EngineConfig, ModelConfig, PipelineConfig
|
|
100
|
+
|
|
101
|
+
config = EngineConfig(
|
|
102
|
+
active_ai_models=["my_vlm_model"],
|
|
103
|
+
models={
|
|
104
|
+
"my_vlm_model": ModelConfig(
|
|
105
|
+
type="vlm_model",
|
|
106
|
+
model_id="model-name",
|
|
107
|
+
api_base_url="http://localhost:8000",
|
|
108
|
+
tag_list=["action1", "action2", "action3"],
|
|
109
|
+
max_new_tokens=128,
|
|
110
|
+
request_timeout=70,
|
|
111
|
+
vlm_detected_tag_confidence=0.99
|
|
112
|
+
)
|
|
113
|
+
},
|
|
114
|
+
pipelines={
|
|
115
|
+
"video_pipeline": PipelineConfig(
|
|
116
|
+
inputs=["video_path", "frame_interval"],
|
|
117
|
+
output="results",
|
|
118
|
+
models=[{"name": "my_vlm_model", "inputs": ["frame"], "outputs": "tags"}]
|
|
119
|
+
)
|
|
120
|
+
}
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Advanced Configuration
|
|
125
|
+
|
|
126
|
+
The package supports complex configurations including:
|
|
127
|
+
- Multiple models in a pipeline
|
|
128
|
+
- Custom preprocessing and postprocessing stages
|
|
129
|
+
- Category-specific settings (thresholds, durations, etc.)
|
|
130
|
+
- Batch processing configurations
|
|
131
|
+
|
|
132
|
+
See the [examples](examples/) directory for detailed configuration examples.
|
|
133
|
+
|
|
134
|
+
## API Reference
|
|
135
|
+
|
|
136
|
+
### VLMEngine
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
class VLMEngine:
|
|
140
|
+
def __init__(self, config: EngineConfig)
|
|
141
|
+
async def initialize()
|
|
142
|
+
async def process_video(video_path: str, **kwargs) -> Dict[str, Any]
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Processing Parameters
|
|
146
|
+
|
|
147
|
+
- `video_path`: Path to the video file
|
|
148
|
+
- `frame_interval`: Seconds between frame samples (default: 0.5)
|
|
149
|
+
- `threshold`: Confidence threshold for tag detection (default: 0.5)
|
|
150
|
+
- `return_timestamps`: Include timestamp information (default: True)
|
|
151
|
+
- `return_confidence`: Include confidence scores (default: True)
|
|
152
|
+
|
|
153
|
+
## Performance Optimization
|
|
154
|
+
|
|
155
|
+
### Memory Requirements
|
|
156
|
+
- **Important**: Video preprocessing loads the entire video into system RAM (not GPU memory)
|
|
157
|
+
- Ensure sufficient RAM for your video sizes (e.g., a 1GB video may require 4-8GB of available RAM)
|
|
158
|
+
- Consider processing videos in segments for very large files
|
|
159
|
+
|
|
160
|
+
### API Optimization
|
|
161
|
+
- Configure retry settings based on your VLM server's capacity
|
|
162
|
+
- Adjust `max_new_tokens` to balance speed vs accuracy
|
|
163
|
+
- Use appropriate `frame_interval` to reduce processing time and API calls
|
|
164
|
+
|
|
165
|
+
### Processing Speed
|
|
166
|
+
- Increase `frame_interval` to sample fewer frames (faster but less accurate)
|
|
167
|
+
- Use batch processing when your VLM endpoint supports it
|
|
168
|
+
- Consider running multiple VLM instances for parallel processing
|
|
169
|
+
|
|
170
|
+
## Extending the Package
|
|
171
|
+
|
|
172
|
+
### Custom Models
|
|
173
|
+
|
|
174
|
+
Create custom model classes by inheriting from the base Model class:
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from vlm_engine.models import Model
|
|
178
|
+
|
|
179
|
+
class CustomModel(Model):
|
|
180
|
+
async def process(self, inputs):
|
|
181
|
+
# Your custom processing logic
|
|
182
|
+
return results
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
### Custom Pipelines
|
|
186
|
+
|
|
187
|
+
Define custom pipelines for specific use cases:
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
custom_pipeline = PipelineConfig(
|
|
191
|
+
inputs=["image_path"],
|
|
192
|
+
output="analysis",
|
|
193
|
+
models=[
|
|
194
|
+
{"name": "preprocessor", "inputs": ["image_path"], "outputs": "processed_image"},
|
|
195
|
+
{"name": "analyzer", "inputs": ["processed_image"], "outputs": "analysis"}
|
|
196
|
+
]
|
|
197
|
+
)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
## Troubleshooting
|
|
201
|
+
|
|
202
|
+
### Common Issues
|
|
203
|
+
|
|
204
|
+
1. **Connection Errors**
|
|
205
|
+
- Ensure your VLM server is running and accessible
|
|
206
|
+
- Check the `api_base_url` configuration
|
|
207
|
+
- Verify firewall settings
|
|
208
|
+
|
|
209
|
+
2. **GPU Memory Errors**
|
|
210
|
+
- Reduce batch size or frame interval
|
|
211
|
+
- Ensure proper CUDA installation
|
|
212
|
+
- Check GPU memory availability
|
|
213
|
+
|
|
214
|
+
3. **Slow Processing**
|
|
215
|
+
- Increase frame interval for faster processing
|
|
216
|
+
- Use GPU acceleration if available
|
|
217
|
+
- Optimize VLM server settings
|
|
218
|
+
|
|
219
|
+
### Logging
|
|
220
|
+
|
|
221
|
+
Enable debug logging for troubleshooting:
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
import logging
|
|
225
|
+
logging.basicConfig(level=logging.DEBUG)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## Contributing
|
|
229
|
+
|
|
230
|
+
Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
|
|
231
|
+
|
|
232
|
+
### Development Setup
|
|
233
|
+
|
|
234
|
+
```bash
|
|
235
|
+
git clone https://github.com/Haven-hvn/haven-vlm-engine-package.git
|
|
236
|
+
cd haven-vlm-engine-package
|
|
237
|
+
pip install -e ".[dev]"
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Running Tests
|
|
241
|
+
|
|
242
|
+
```bash
|
|
243
|
+
pytest tests/
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## License
|
|
247
|
+
|
|
248
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
249
|
+
|
|
250
|
+
## Acknowledgments
|
|
251
|
+
|
|
252
|
+
- Built on top of modern Python async patterns
|
|
253
|
+
- Inspired by production ML serving architectures
|
|
254
|
+
- Haven's custom VLM models trained using [SmolVLM-Finetune](https://github.com/Haven-hvn/SmolVLM-Finetune) - Model Download found on [https://havenmodels.orbiter.website/](https://havenmodels.orbiter.website/)
|
|
255
|
+
|
|
256
|
+
- Designed for integration with OpenAI-compatible VLM endpoints
|
|
257
|
+
|
|
258
|
+
## Support
|
|
259
|
+
|
|
260
|
+
For issues and feature requests, please use the [GitHub issue tracker](https://github.com/Haven-hvn/haven-vlm-engine-package/issues).
|
|
261
|
+
|
|
262
|
+
For questions and discussions, join our community:
|
|
263
|
+
- Discord: [Link to Discord](https://discord.gg/57mPMDfQew)
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
**Note**: This package requires an OpenAI-compatible VLM endpoint. Options include:
|
|
268
|
+
|
|
269
|
+
### Remote Services
|
|
270
|
+
- Any OpenAI-compatible API endpoint
|
|
271
|
+
- Akash deployment - https://github.com/Haven-hvn/haven-inference
|
|
272
|
+
|
|
273
|
+
### Local Setup
|
|
274
|
+
- [LM Studio](https://lmstudio.ai/) - Easy local VLM hosting with OpenAI-compatible API
|
|
275
|
+
|
|
276
|
+
The package **does not** load VLM models directly - it communicates with external VLM services via API.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

# Read the long description from the README explicitly as UTF-8 so the build
# does not depend on the machine's locale default encoding, and use a context
# manager so the file handle is closed deterministically (the original
# `open(...).read()` left the handle to the garbage collector).
with open("README.md", encoding="utf-8") as readme:
    long_description = readme.read()

setup(
    name="vlm_engine",
    version="0.1.0",
    description="Advanced Vision-Language Model Engine for content tagging",
    long_description=long_description,
    long_description_content_type="text/markdown",
    author="HAVEN Network",
    author_email="officialhavennetwork@gmail.com",
    url="https://github.com/Haven-hvn/haven-vlm-engine-package",
    packages=find_packages(),
    install_requires=[
        "pydantic",
        "numpy",
        "torch",
        "torchvision",
        "aiohttp",
        "pyyaml",
        "opencv-python",
    ],
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.8",
)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""HAVEN VLM Engine: Advanced Content Tagging System
|
|
2
|
+
|
|
3
|
+
This package provides an advanced Vision-Language Model implementation
|
|
4
|
+
for automatic content tagging, delivering superior accuracy compared
|
|
5
|
+
to traditional image classification methods.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .engine import VLMEngine
|
|
9
|
+
|
|
10
|
+
__all__ = ["VLMEngine"]
|
|
11
|
+
__version__ = "0.1.0"
|