vlm-engine 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,309 @@
1
+ Metadata-Version: 2.4
2
+ Name: vlm_engine
3
+ Version: 0.1.0
4
+ Summary: Advanced Vision-Language Model Engine for content tagging
5
+ Home-page: https://github.com/Haven-hvn/haven-vlm-engine-package
6
+ Author: HAVEN Network
7
+ Author-email: officialhavennetwork@gmail.com
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Requires-Python: >=3.8
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: pydantic
18
+ Requires-Dist: numpy
19
+ Requires-Dist: torch
20
+ Requires-Dist: torchvision
21
+ Requires-Dist: aiohttp
22
+ Requires-Dist: pyyaml
23
+ Requires-Dist: opencv-python
24
+ Dynamic: author
25
+ Dynamic: author-email
26
+ Dynamic: classifier
27
+ Dynamic: description
28
+ Dynamic: description-content-type
29
+ Dynamic: home-page
30
+ Dynamic: requires-dist
31
+ Dynamic: requires-python
32
+ Dynamic: summary
33
+
34
+ # VLM Engine
35
+
36
+ A high-performance Python package for Vision-Language Model (VLM) based content tagging and analysis. This package provides an advanced implementation for automatic content detection and tagging, delivering superior accuracy compared to traditional image classification methods.
37
+
38
+ ## Features
39
+
40
+ - **Remote VLM Integration**: Connects to any OpenAI-compatible VLM endpoint (no local model loading required)
41
+ - **Context-Aware Detection**: Leverages Vision-Language Models' understanding of visual relationships for accurate content tagging
42
+ - **Flexible Architecture**: Modular pipeline system with configurable models and processing stages
43
+ - **Asynchronous Processing**: Built on asyncio for efficient video and image processing
44
+ - **Customizable Tag Sets**: Easy configuration of detection categories and confidence thresholds
45
+ - **Production Ready**: Includes retry logic, error handling, and comprehensive logging
46
+
47
+ ## Installation
48
+
49
+ ### From PyPI (when published)
50
+ ```bash
51
+ pip install vlm-engine
52
+ ```
53
+
54
+ ### From Source
55
+ ```bash
56
+ git clone https://github.com/Haven-hvn/haven-vlm-engine-package.git
57
+ cd vlm-engine-package
58
+ pip install -e .
59
+ ```
60
+
61
+ ### Requirements
62
+ - Python 3.8+
63
+ - **Sufficient RAM**: Video preprocessing loads entire videos into memory (not GPU memory)
64
+ - Compatible VLM server endpoint:
65
+ - Remote OpenAI-compatible API (recommended)
66
+ - Local server using [LM Studio](https://lmstudio.ai/)
67
+ - Haven's custom VLM available at [https://havenmodels.orbiter.website/](https://havenmodels.orbiter.website/)
68
+
69
+ ## Quick Start
70
+
71
+ ```python
72
+ import asyncio
73
+ from vlm_engine import VLMEngine
74
+ from vlm_engine.config_models import EngineConfig, ModelConfig
75
+
76
+ # Configure the engine
77
+ config = EngineConfig(
78
+ active_ai_models=["vlm_nsfw_model"],
79
+ models={
80
+ "vlm_nsfw_model": ModelConfig(
81
+ type="vlm_model",
82
+ model_id="HuggingFaceTB/SmolVLM-Instruct",
83
+ api_base_url="http://localhost:7045",
84
+ tag_list=["tag1", "tag2", "tag3"] # Your custom tags
85
+ )
86
+ }
87
+ )
88
+
89
+ # Initialize and use
90
+ async def main():
91
+ engine = VLMEngine(config)
92
+ await engine.initialize()
93
+
94
+ results = await engine.process_video(
95
+ "path/to/video.mp4",
96
+ frame_interval=2.0,
97
+ threshold=0.5
98
+ )
99
+ print(f"Detected tags: {results}")
100
+
101
+ asyncio.run(main())
102
+ ```
103
+
104
+ ## Architecture
105
+
106
+ ### Core Components
107
+
108
+ 1. **VLMEngine**: Main entry point for the package
109
+ - Manages model initialization and pipeline execution
110
+ - Handles asynchronous processing of videos and images
111
+
112
+ 2. **VLMClient**: OpenAI-compatible API client
113
+ - Supports any VLM with chat completions endpoint
114
+ - Includes retry logic with exponential backoff and jitter
115
+ - Handles image encoding and prompt formatting
116
+
117
+ 3. **Pipeline System**: Flexible processing pipeline
118
+ - Modular design allows custom processing stages
119
+ - Built-in support for preprocessing, analysis, and postprocessing
120
+ - Configurable through YAML or Python objects
121
+
122
+ 4. **Model Management**: Dynamic model loading
123
+ - Supports multiple model types (VLM, preprocessors, postprocessors)
124
+ - Lazy loading for efficient resource usage
125
+ - Thread-safe model access
126
+
127
+ ## Configuration
128
+
129
+ ### Basic Configuration
130
+
131
+ ```python
132
+ from vlm_engine.config_models import EngineConfig, ModelConfig, PipelineConfig
133
+
134
+ config = EngineConfig(
135
+ active_ai_models=["my_vlm_model"],
136
+ models={
137
+ "my_vlm_model": ModelConfig(
138
+ type="vlm_model",
139
+ model_id="model-name",
140
+ api_base_url="http://localhost:8000",
141
+ tag_list=["action1", "action2", "action3"],
142
+ max_new_tokens=128,
143
+ request_timeout=70,
144
+ vlm_detected_tag_confidence=0.99
145
+ )
146
+ },
147
+ pipelines={
148
+ "video_pipeline": PipelineConfig(
149
+ inputs=["video_path", "frame_interval"],
150
+ output="results",
151
+ models=[{"name": "my_vlm_model", "inputs": ["frame"], "outputs": "tags"}]
152
+ )
153
+ }
154
+ )
155
+ ```
156
+
157
+ ### Advanced Configuration
158
+
159
+ The package supports complex configurations including:
160
+ - Multiple models in a pipeline
161
+ - Custom preprocessing and postprocessing stages
162
+ - Category-specific settings (thresholds, durations, etc.)
163
+ - Batch processing configurations
164
+
165
+ See the [examples](examples/) directory for detailed configuration examples.
166
+
167
+ ## API Reference
168
+
169
+ ### VLMEngine
170
+
171
+ ```python
172
+ class VLMEngine:
173
+ def __init__(self, config: EngineConfig)
174
+ async def initialize()
175
+ async def process_video(video_path: str, **kwargs) -> Dict[str, Any]
176
+ ```
177
+
178
+ ### Processing Parameters
179
+
180
+ - `video_path`: Path to the video file
181
+ - `frame_interval`: Seconds between frame samples (default: 0.5)
182
+ - `threshold`: Confidence threshold for tag detection (default: 0.5)
183
+ - `return_timestamps`: Include timestamp information (default: True)
184
+ - `return_confidence`: Include confidence scores (default: True)
185
+
186
+ ## Performance Optimization
187
+
188
+ ### Memory Requirements
189
+ - **Important**: Video preprocessing loads the entire video into system RAM (not GPU memory)
190
+ - Ensure sufficient RAM for your video sizes (e.g., a 1GB video may require 4-8GB of available RAM)
191
+ - Consider processing videos in segments for very large files
192
+
193
+ ### API Optimization
194
+ - Configure retry settings based on your VLM server's capacity
195
+ - Adjust `max_new_tokens` to balance speed vs accuracy
196
+ - Use appropriate `frame_interval` to reduce processing time and API calls
197
+
198
+ ### Processing Speed
199
+ - Increase `frame_interval` to sample fewer frames (faster but less accurate)
200
+ - Use batch processing when your VLM endpoint supports it
201
+ - Consider running multiple VLM instances for parallel processing
202
+
203
+ ## Extending the Package
204
+
205
+ ### Custom Models
206
+
207
+ Create custom model classes by inheriting from the base Model class:
208
+
209
+ ```python
210
+ from vlm_engine.models import Model
211
+
212
+ class CustomModel(Model):
213
+ async def process(self, inputs):
214
+ # Your custom processing logic
215
+ return results
216
+ ```
217
+
218
+ ### Custom Pipelines
219
+
220
+ Define custom pipelines for specific use cases:
221
+
222
+ ```python
223
+ custom_pipeline = PipelineConfig(
224
+ inputs=["image_path"],
225
+ output="analysis",
226
+ models=[
227
+ {"name": "preprocessor", "inputs": ["image_path"], "outputs": "processed_image"},
228
+ {"name": "analyzer", "inputs": ["processed_image"], "outputs": "analysis"}
229
+ ]
230
+ )
231
+ ```
232
+
233
+ ## Troubleshooting
234
+
235
+ ### Common Issues
236
+
237
+ 1. **Connection Errors**
238
+ - Ensure your VLM server is running and accessible
239
+ - Check the `api_base_url` configuration
240
+ - Verify firewall settings
241
+
242
+ 2. **GPU Memory Errors**
243
+ - Reduce batch size or frame interval
244
+ - Ensure proper CUDA installation
245
+ - Check GPU memory availability
246
+
247
+ 3. **Slow Processing**
248
+ - Increase frame interval for faster processing
249
+ - Use GPU acceleration if available
250
+ - Optimize VLM server settings
251
+
252
+ ### Logging
253
+
254
+ Enable debug logging for troubleshooting:
255
+
256
+ ```python
257
+ import logging
258
+ logging.basicConfig(level=logging.DEBUG)
259
+ ```
260
+
261
+ ## Contributing
262
+
263
+ Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
264
+
265
+ ### Development Setup
266
+
267
+ ```bash
268
+ git clone https://github.com/yourusername/vlm-engine.git
269
+ cd vlm-engine
270
+ pip install -e ".[dev]"
271
+ ```
272
+
273
+ ### Running Tests
274
+
275
+ ```bash
276
+ pytest tests/
277
+ ```
278
+
279
+ ## License
280
+
281
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
282
+
283
+ ## Acknowledgments
284
+
285
+ - Built on top of modern Python async patterns
286
+ - Inspired by production ML serving architectures
287
+ - Haven's custom VLM models were trained using [SmolVLM-Finetune](https://github.com/Haven-hvn/SmolVLM-Finetune); model downloads are available at [https://havenmodels.orbiter.website/](https://havenmodels.orbiter.website/)
288
+
289
+ - Designed for integration with OpenAI-compatible VLM endpoints
290
+
291
+ ## Support
292
+
293
+ For issues and feature requests, please use the [GitHub issue tracker](https://github.com/Haven-hvn/haven-vlm-engine-package/issues).
294
+
295
+ For questions and discussions, join our community:
296
+ - Discord: [Link to Discord](https://discord.gg/57mPMDfQew)
297
+
298
+ ---
299
+
300
+ **Note**: This package requires an OpenAI-compatible VLM endpoint. Options include:
301
+
302
+ ### Remote Services
303
+ - Any OpenAI-compatible API endpoint
304
+ - Akash deployment - https://github.com/Haven-hvn/haven-inference
305
+
306
+ ### Local Setup
307
+ - [LM Studio](https://lmstudio.ai/) - Easy local VLM hosting with OpenAI-compatible API
308
+
309
+ The package **does not** load VLM models directly - it communicates with external VLM services via API.
@@ -0,0 +1,276 @@
1
+ # VLM Engine
2
+
3
+ A high-performance Python package for Vision-Language Model (VLM) based content tagging and analysis. This package provides an advanced implementation for automatic content detection and tagging, delivering superior accuracy compared to traditional image classification methods.
4
+
5
+ ## Features
6
+
7
+ - **Remote VLM Integration**: Connects to any OpenAI-compatible VLM endpoint (no local model loading required)
8
+ - **Context-Aware Detection**: Leverages Vision-Language Models' understanding of visual relationships for accurate content tagging
9
+ - **Flexible Architecture**: Modular pipeline system with configurable models and processing stages
10
+ - **Asynchronous Processing**: Built on asyncio for efficient video and image processing
11
+ - **Customizable Tag Sets**: Easy configuration of detection categories and confidence thresholds
12
+ - **Production Ready**: Includes retry logic, error handling, and comprehensive logging
13
+
14
+ ## Installation
15
+
16
+ ### From PyPI (when published)
17
+ ```bash
18
+ pip install vlm-engine
19
+ ```
20
+
21
+ ### From Source
22
+ ```bash
23
+ git clone https://github.com/Haven-hvn/haven-vlm-engine-package.git
24
+ cd vlm-engine-package
25
+ pip install -e .
26
+ ```
27
+
28
+ ### Requirements
29
+ - Python 3.8+
30
+ - **Sufficient RAM**: Video preprocessing loads entire videos into memory (not GPU memory)
31
+ - Compatible VLM server endpoint:
32
+ - Remote OpenAI-compatible API (recommended)
33
+ - Local server using [LM Studio](https://lmstudio.ai/)
34
+ - Haven's custom VLM available at [https://havenmodels.orbiter.website/](https://havenmodels.orbiter.website/)
35
+
36
+ ## Quick Start
37
+
38
+ ```python
39
+ import asyncio
40
+ from vlm_engine import VLMEngine
41
+ from vlm_engine.config_models import EngineConfig, ModelConfig
42
+
43
+ # Configure the engine
44
+ config = EngineConfig(
45
+ active_ai_models=["vlm_nsfw_model"],
46
+ models={
47
+ "vlm_nsfw_model": ModelConfig(
48
+ type="vlm_model",
49
+ model_id="HuggingFaceTB/SmolVLM-Instruct",
50
+ api_base_url="http://localhost:7045",
51
+ tag_list=["tag1", "tag2", "tag3"] # Your custom tags
52
+ )
53
+ }
54
+ )
55
+
56
+ # Initialize and use
57
+ async def main():
58
+ engine = VLMEngine(config)
59
+ await engine.initialize()
60
+
61
+ results = await engine.process_video(
62
+ "path/to/video.mp4",
63
+ frame_interval=2.0,
64
+ threshold=0.5
65
+ )
66
+ print(f"Detected tags: {results}")
67
+
68
+ asyncio.run(main())
69
+ ```
70
+
71
+ ## Architecture
72
+
73
+ ### Core Components
74
+
75
+ 1. **VLMEngine**: Main entry point for the package
76
+ - Manages model initialization and pipeline execution
77
+ - Handles asynchronous processing of videos and images
78
+
79
+ 2. **VLMClient**: OpenAI-compatible API client
80
+ - Supports any VLM with chat completions endpoint
81
+ - Includes retry logic with exponential backoff and jitter
82
+ - Handles image encoding and prompt formatting
83
+
84
+ 3. **Pipeline System**: Flexible processing pipeline
85
+ - Modular design allows custom processing stages
86
+ - Built-in support for preprocessing, analysis, and postprocessing
87
+ - Configurable through YAML or Python objects
88
+
89
+ 4. **Model Management**: Dynamic model loading
90
+ - Supports multiple model types (VLM, preprocessors, postprocessors)
91
+ - Lazy loading for efficient resource usage
92
+ - Thread-safe model access
93
+
94
+ ## Configuration
95
+
96
+ ### Basic Configuration
97
+
98
+ ```python
99
+ from vlm_engine.config_models import EngineConfig, ModelConfig, PipelineConfig
100
+
101
+ config = EngineConfig(
102
+ active_ai_models=["my_vlm_model"],
103
+ models={
104
+ "my_vlm_model": ModelConfig(
105
+ type="vlm_model",
106
+ model_id="model-name",
107
+ api_base_url="http://localhost:8000",
108
+ tag_list=["action1", "action2", "action3"],
109
+ max_new_tokens=128,
110
+ request_timeout=70,
111
+ vlm_detected_tag_confidence=0.99
112
+ )
113
+ },
114
+ pipelines={
115
+ "video_pipeline": PipelineConfig(
116
+ inputs=["video_path", "frame_interval"],
117
+ output="results",
118
+ models=[{"name": "my_vlm_model", "inputs": ["frame"], "outputs": "tags"}]
119
+ )
120
+ }
121
+ )
122
+ ```
123
+
124
+ ### Advanced Configuration
125
+
126
+ The package supports complex configurations including:
127
+ - Multiple models in a pipeline
128
+ - Custom preprocessing and postprocessing stages
129
+ - Category-specific settings (thresholds, durations, etc.)
130
+ - Batch processing configurations
131
+
132
+ See the [examples](examples/) directory for detailed configuration examples.
133
+
134
+ ## API Reference
135
+
136
+ ### VLMEngine
137
+
138
+ ```python
139
+ class VLMEngine:
140
+ def __init__(self, config: EngineConfig)
141
+ async def initialize()
142
+ async def process_video(video_path: str, **kwargs) -> Dict[str, Any]
143
+ ```
144
+
145
+ ### Processing Parameters
146
+
147
+ - `video_path`: Path to the video file
148
+ - `frame_interval`: Seconds between frame samples (default: 0.5)
149
+ - `threshold`: Confidence threshold for tag detection (default: 0.5)
150
+ - `return_timestamps`: Include timestamp information (default: True)
151
+ - `return_confidence`: Include confidence scores (default: True)
152
+
153
+ ## Performance Optimization
154
+
155
+ ### Memory Requirements
156
+ - **Important**: Video preprocessing loads the entire video into system RAM (not GPU memory)
157
+ - Ensure sufficient RAM for your video sizes (e.g., a 1GB video may require 4-8GB of available RAM)
158
+ - Consider processing videos in segments for very large files
159
+
160
+ ### API Optimization
161
+ - Configure retry settings based on your VLM server's capacity
162
+ - Adjust `max_new_tokens` to balance speed vs accuracy
163
+ - Use appropriate `frame_interval` to reduce processing time and API calls
164
+
165
+ ### Processing Speed
166
+ - Increase `frame_interval` to sample fewer frames (faster but less accurate)
167
+ - Use batch processing when your VLM endpoint supports it
168
+ - Consider running multiple VLM instances for parallel processing
169
+
170
+ ## Extending the Package
171
+
172
+ ### Custom Models
173
+
174
+ Create custom model classes by inheriting from the base Model class:
175
+
176
+ ```python
177
+ from vlm_engine.models import Model
178
+
179
+ class CustomModel(Model):
180
+ async def process(self, inputs):
181
+ # Your custom processing logic
182
+ return results
183
+ ```
184
+
185
+ ### Custom Pipelines
186
+
187
+ Define custom pipelines for specific use cases:
188
+
189
+ ```python
190
+ custom_pipeline = PipelineConfig(
191
+ inputs=["image_path"],
192
+ output="analysis",
193
+ models=[
194
+ {"name": "preprocessor", "inputs": ["image_path"], "outputs": "processed_image"},
195
+ {"name": "analyzer", "inputs": ["processed_image"], "outputs": "analysis"}
196
+ ]
197
+ )
198
+ ```
199
+
200
+ ## Troubleshooting
201
+
202
+ ### Common Issues
203
+
204
+ 1. **Connection Errors**
205
+ - Ensure your VLM server is running and accessible
206
+ - Check the `api_base_url` configuration
207
+ - Verify firewall settings
208
+
209
+ 2. **GPU Memory Errors**
210
+ - Reduce batch size or frame interval
211
+ - Ensure proper CUDA installation
212
+ - Check GPU memory availability
213
+
214
+ 3. **Slow Processing**
215
+ - Increase frame interval for faster processing
216
+ - Use GPU acceleration if available
217
+ - Optimize VLM server settings
218
+
219
+ ### Logging
220
+
221
+ Enable debug logging for troubleshooting:
222
+
223
+ ```python
224
+ import logging
225
+ logging.basicConfig(level=logging.DEBUG)
226
+ ```
227
+
228
+ ## Contributing
229
+
230
+ Contributions are welcome! Please feel free to submit a Pull Request. For major changes, please open an issue first to discuss what you would like to change.
231
+
232
+ ### Development Setup
233
+
234
+ ```bash
235
+ git clone https://github.com/yourusername/vlm-engine.git
236
+ cd vlm-engine
237
+ pip install -e ".[dev]"
238
+ ```
239
+
240
+ ### Running Tests
241
+
242
+ ```bash
243
+ pytest tests/
244
+ ```
245
+
246
+ ## License
247
+
248
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
249
+
250
+ ## Acknowledgments
251
+
252
+ - Built on top of modern Python async patterns
253
+ - Inspired by production ML serving architectures
254
+ - Haven's custom VLM models were trained using [SmolVLM-Finetune](https://github.com/Haven-hvn/SmolVLM-Finetune); model downloads are available at [https://havenmodels.orbiter.website/](https://havenmodels.orbiter.website/)
255
+
256
+ - Designed for integration with OpenAI-compatible VLM endpoints
257
+
258
+ ## Support
259
+
260
+ For issues and feature requests, please use the [GitHub issue tracker](https://github.com/Haven-hvn/haven-vlm-engine-package/issues).
261
+
262
+ For questions and discussions, join our community:
263
+ - Discord: [Link to Discord](https://discord.gg/57mPMDfQew)
264
+
265
+ ---
266
+
267
+ **Note**: This package requires an OpenAI-compatible VLM endpoint. Options include:
268
+
269
+ ### Remote Services
270
+ - Any OpenAI-compatible API endpoint
271
+ - Akash deployment - https://github.com/Haven-hvn/haven-inference
272
+
273
+ ### Local Setup
274
+ - [LM Studio](https://lmstudio.ai/) - Easy local VLM hosting with OpenAI-compatible API
275
+
276
+ The package **does not** load VLM models directly - it communicates with external VLM services via API.
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,32 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ setup(
4
+ name="vlm_engine",
5
+ version="0.1.0",
6
+ description="Advanced Vision-Language Model Engine for content tagging",
7
+ long_description=open("README.md").read(),
8
+ long_description_content_type="text/markdown",
9
+ author="HAVEN Network",
10
+ author_email="officialhavennetwork@gmail.com",
11
+ url="https://github.com/Haven-hvn/haven-vlm-engine-package",
12
+ packages=find_packages(),
13
+ install_requires=[
14
+ "pydantic",
15
+ "numpy",
16
+ "torch",
17
+ "torchvision",
18
+ "aiohttp",
19
+ "pyyaml",
20
+ "opencv-python"
21
+ ],
22
+ classifiers=[
23
+ "Development Status :: 3 - Alpha",
24
+ "Intended Audience :: Developers",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Programming Language :: Python :: 3",
27
+ "Programming Language :: Python :: 3.8",
28
+ "Programming Language :: Python :: 3.9",
29
+ "Programming Language :: Python :: 3.10",
30
+ ],
31
+ python_requires=">=3.8",
32
+ )
@@ -0,0 +1,11 @@
1
+ """HAVEN VLM Engine: Advanced Content Tagging System
2
+
3
+ This package provides an advanced Vision-Language Model implementation
4
+ for automatic content tagging, delivering superior accuracy compared
5
+ to traditional image classification methods.
6
+ """
7
+
8
+ from .engine import VLMEngine
9
+
10
+ __all__ = ["VLMEngine"]
11
+ __version__ = "0.1.0"