VidChain 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vidchain-0.2.0/LICENSE +21 -0
- vidchain-0.2.0/PKG-INFO +184 -0
- vidchain-0.2.0/README.md +146 -0
- vidchain-0.2.0/VidChain.egg-info/PKG-INFO +184 -0
- vidchain-0.2.0/VidChain.egg-info/SOURCES.txt +26 -0
- vidchain-0.2.0/VidChain.egg-info/dependency_links.txt +1 -0
- vidchain-0.2.0/VidChain.egg-info/entry_points.txt +3 -0
- vidchain-0.2.0/VidChain.egg-info/requires.txt +18 -0
- vidchain-0.2.0/VidChain.egg-info/top_level.txt +1 -0
- vidchain-0.2.0/pyproject.toml +65 -0
- vidchain-0.2.0/setup.cfg +4 -0
- vidchain-0.2.0/vidchain/__init__.py +7 -0
- vidchain-0.2.0/vidchain/cli.py +106 -0
- vidchain-0.2.0/vidchain/core/__init__.py +0 -0
- vidchain-0.2.0/vidchain/core/fusion.py +62 -0
- vidchain-0.2.0/vidchain/core/ollama_engine.py +34 -0
- vidchain-0.2.0/vidchain/llm/__init__.py +0 -0
- vidchain-0.2.0/vidchain/loaders/__init__.py +0 -0
- vidchain-0.2.0/vidchain/loaders/audio_loader.py +18 -0
- vidchain-0.2.0/vidchain/loaders/video_loader.py +105 -0
- vidchain-0.2.0/vidchain/processor.py +115 -0
- vidchain-0.2.0/vidchain/processors/__init__.py +0 -0
- vidchain-0.2.0/vidchain/processors/audio_model.py +50 -0
- vidchain-0.2.0/vidchain/processors/ocr_model.py +77 -0
- vidchain-0.2.0/vidchain/processors/vision_model.py +180 -0
- vidchain-0.2.0/vidchain/rag.py +187 -0
- vidchain-0.2.0/vidchain/storage/__init__.py +0 -0
- vidchain-0.2.0/vidchain/vision.py +34 -0
vidchain-0.2.0/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Rahul Sharma

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
vidchain-0.2.0/PKG-INFO
ADDED
@@ -0,0 +1,184 @@
Metadata-Version: 2.4
Name: VidChain
Version: 0.2.0
Summary: A Lightweight Video RAG Framework for Multimodal Reasoning
Author-email: Rahul Sharma <rahulsharma.hps@gmail.com>
License: MIT
Project-URL: Homepage, https://github.com/rahulsiiitm/videochain-python
Project-URL: Bug Tracker, https://github.com/rahulsiiitm/videochain-python/issues
Keywords: video-rag,multimodal,ai,computer-vision,whisper,ollama,yolo
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3.11
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.11
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: opencv-python
Requires-Dist: scenedetect
Requires-Dist: openai-whisper
Requires-Dist: sentence-transformers
Requires-Dist: faiss-cpu
Requires-Dist: litellm
Requires-Dist: google-generativeai
Requires-Dist: moviepy
Requires-Dist: pillow<12.0
Requires-Dist: torch
Requires-Dist: torchvision
Requires-Dist: torchaudio
Requires-Dist: numpy
Requires-Dist: tqdm
Requires-Dist: python-dotenv
Requires-Dist: ultralytics
Requires-Dist: librosa
Requires-Dist: easyocr
Dynamic: license-file
(The remaining lines of PKG-INFO are the long description, identical to the contents of vidchain-0.2.0/README.md, shown in full below.)

vidchain-0.2.0/README.md
ADDED
@@ -0,0 +1,146 @@
# VidChain: Video Intelligence RAG Framework

> Edge-optimized multimodal RAG framework for video understanding — transforms raw footage into a structured, queryable knowledge base.

[PyPI](https://pypi.org/project/VidChain/)

---

## Overview

VidChain v0.2.0 is a lightweight, modular framework that combines computer vision, smart OCR, speech recognition, and LLM reasoning into a unified **late-fusion pipeline**. Designed to run efficiently on consumer-grade GPUs (tested on an NVIDIA RTX 3050), it extracts human-readable stories from raw sensor data, making on-device video intelligence practical without heavy cloud dependency.

At the heart of the framework is **B.A.B.U.R.A.O.** (*Behavioral Analysis & Broadcasting Unit for Real-time Artificial Observation*), an AI copilot that uses abductive reasoning to translate raw, flickering object/action logs into flowing, conversational narratives.

---

## Core Pipeline

```text
Video Input → Adaptive Keyframes → Dual-Brain Vision (YOLO + MobileNet) + OCR → Audio Transcription → Semantic Chunking → FAISS Vector DB → B.A.B.U.R.A.O. RAG
```
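
The "Adaptive Keyframes" stage is a good place to start reading the pipeline. The package's own loader (`vidchain/loaders/video_loader.py`) is not reproduced in this diff, so the following is only a minimal sketch of scene-adaptive keyframe selection built on the declared `scenedetect` and `opencv-python` dependencies; the function name and threshold are assumptions.

```python
# Minimal sketch of scene-adaptive keyframe selection with PySceneDetect + OpenCV.
# Illustrative only: threshold and helper name are assumptions, not VidChain's loader.
import cv2
from scenedetect import detect, ContentDetector

def extract_keyframes(video_path: str, threshold: float = 27.0):
    """Return one (timestamp_seconds, frame) pair per detected scene."""
    scenes = detect(video_path, ContentDetector(threshold=threshold))  # [(start, end), ...]
    cap = cv2.VideoCapture(video_path)
    keyframes = []
    for start, _end in scenes:
        # Seek to the first frame of each scene and grab it as the keyframe.
        cap.set(cv2.CAP_PROP_POS_MSEC, start.get_seconds() * 1000.0)
        ok, frame = cap.read()
        if ok:
            keyframes.append((start.get_seconds(), frame))
    cap.release()
    return keyframes
```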

-----

## Key Capabilities

### 🧠 Dual-Brain Vision Engine

Instead of basic classification, VidChain uses a two-pronged visual approach (a sketch follows the list):

* **The "Noun" Engine (YOLOv8):** Detects specific objects (e.g., "1 person, 2 laptops").
* **The "Verb" Engine (MobileNetV3):** Classifies the intent or state of the scene (e.g., NORMAL, SUSPICIOUS, VIOLENCE).

### 🔤 Context-Aware OCR

Powered by EasyOCR, the system scans for text *only* when YOLO detects readable surfaces (monitors, laptops, books, whiteboards), saving significant compute while capturing ground-truth data (e.g., reading the brand "ASUS Vivobook" off a laptop).
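
A minimal sketch of that gating idea follows; the set of "readable" trigger classes and the helper name are assumptions rather than the contents of `vidchain/processors/ocr_model.py`:

```python
# Sketch: only run EasyOCR when the detector has already found a readable surface.
# The set of trigger labels is an assumption for illustration.
import easyocr

READABLE_SURFACES = {"laptop", "tv", "book", "cell phone"}   # assumed trigger classes
reader = easyocr.Reader(["en"], gpu=True)                     # load once, reuse per frame

def maybe_read_text(frame_bgr, detected_labels):
    """Run OCR only if YOLO saw something worth reading; otherwise skip the expensive pass."""
    if not READABLE_SURFACES.intersection(detected_labels):
        return []
    # detail=0 returns plain strings such as "ASUS Vivobook".
    return reader.readtext(frame_bgr, detail=0)
```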

### B.A.B.U.R.A.O. RAG Engine (Conversational)

Unlike standard RAG pipelines that read out robotic timelines, B.A.B.U.R.A.O. acts as a human copilot (a sketch follows the list):

* **Abductive Reasoning:** If it sees a "laptop" and a "keyboard", it deduces the scene is a "computer desk."
* **Sensor Filtering:** Automatically ignores momentary hardware glitches/hallucinations (e.g., a TV briefly misidentified as an oven).
* **Natural Translation:** Translates raw model labels like `VIOLENCE` into contextual human behaviors like "the person became visibly frustrated and hit the desk."
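
A minimal sketch of how retrieved timeline chunks might be combined with such a persona prompt and routed through LiteLLM; the prompt wording and function are illustrative assumptions, not the actual `vidchain/rag.py`:

```python
# Sketch: answer a question from retrieved timeline chunks via LiteLLM.
# Prompt wording is an assumption; the README lists gemini-2.5-flash as the default model.
from litellm import completion

SYSTEM_PROMPT = (
    "You are B.A.B.U.R.A.O., a video analysis copilot. Use abductive reasoning to infer the "
    "scene from detected objects, ignore one-off detections that contradict the rest of the "
    "timeline, and describe model labels such as VIOLENCE as natural human behavior."
)

def answer(question: str, retrieved_chunks: list[str]) -> str:
    context = "\n".join(retrieved_chunks)
    response = completion(
        model="gemini/gemini-2.5-flash",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Timeline:\n{context}\n\nQuestion: {question}"},
        ],
    )
    return response.choices[0].message.content
```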

### Edge-First GPU Optimization

Engineered to prevent VRAM crashes: smart memory routing disables PyTorch's problematic layer fusion during YOLO inference and manages VRAM safely across the concurrent vision, audio, and language models.
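
The memory-routing code itself is not part of this diff, and the layer-fusion toggle cannot be reconstructed from it. As a generic illustration of the sequential-load side of the idea, one common pattern on a 4 GB card is to run heavy stages one at a time and release VRAM in between:

```python
# Generic sketch of staged GPU usage: load a model, run its pass, then free VRAM
# before the next heavy model starts. Not VidChain's actual memory router.
import gc
import torch

def run_stage(build_model, inputs):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = build_model().to(device).eval()
    with torch.no_grad():
        outputs = [model(x.to(device)).cpu() for x in inputs]
    # Drop references and flush the CUDA caching allocator before the next stage.
    del model
    gc.collect()
    if device == "cuda":
        torch.cuda.empty_cache()
    return outputs
```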

-----

## Installation

```bash
# 1. Install the core package
pip install vidchain

# 2. IMPORTANT: Install GPU-accelerated PyTorch (CUDA 12.1 recommended)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 --force-reinstall
```

> ⚠️ **Requirement:** NVIDIA drivers and CUDA are strongly recommended. To verify your hardware is correctly mapped, run the built-in diagnostic script: `python scripts/check_gpu.py`
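
If the diagnostic script is not at hand, a quick manual check from Python confirms whether the CUDA-enabled wheel actually took effect:

```python
# Quick sanity check that the CUDA-enabled PyTorch wheel is the one being used.
import torch

print("torch version :", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device        :", torch.cuda.get_device_name(0))
```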

-----

## Quick Start

### 1 — Analyze a Video (Build Knowledge Base)

Analyze a video file, extract multimodal context, and generate a structured JSON timeline:

```bash
vidchain-analyze sample.mp4
```

*This command automatically builds a FAISS index and drops you into the interactive B.A.B.U.R.A.O. chat terminal.*

### 2 — Train the Action Engine

Fine-tune the MobileNetV3 "Verb" classifier on your domain-specific dataset:

```bash
vidchain-train
```

Place labeled training images under `data/train/` before running.
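
The exact folder layout is not documented in this diff. Assuming the standard `torchvision.datasets.ImageFolder` convention (one sub-folder per action label), the data would be organized roughly as in this sketch:

```python
# Assumed layout (ImageFolder convention), e.g.:
#   data/train/NORMAL/img_001.jpg
#   data/train/SUSPICIOUS/img_042.jpg
#   data/train/VIOLENCE/img_113.jpg
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

train_ds = datasets.ImageFolder(
    "data/train",
    transform=transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()]),
)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
print(train_ds.classes)  # label names are taken from the sub-folder names
```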

-----

## Knowledge Base Schema

The framework uses **Semantic Chunking** to compress repetitive frames. The resulting `knowledge_base.json` is a clean, fused timeline (excerpt):

```json
{
  "time": 0.97,
  "type": "ocr",
  "content": "ASUS Vivabook"
},
{
  "time": 3.87,
  "type": "visual",
  "content": "Duration: [3.87s - 6.77s] | Subjects: 1 laptop, 1 tv | Action State: SUSPICIOUS"
},
{
  "time": 19.34,
  "type": "visual",
  "content": "Duration: [19.34s - 19.34s] | Subjects: 1 tv | Action State: VIOLENCE"
}
```
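
The package ships its own indexing code (`vidchain/rag.py`, not shown in this diff). As a minimal sketch of how timeline entries like the ones above can be embedded with `all-MiniLM-L6-v2` and searched with FAISS:

```python
# Sketch: embed knowledge_base.json entries and run a semantic search over them.
# Assumes the file is a JSON array of entries shaped like the excerpt above.
import json

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

with open("knowledge_base.json", encoding="utf-8") as f:
    entries = json.load(f)
texts = [f'[{e["time"]:.2f}s] ({e["type"]}) {e["content"]}' for e in entries]

encoder = SentenceTransformer("all-MiniLM-L6-v2")
vectors = encoder.encode(texts, normalize_embeddings=True).astype(np.float32)

index = faiss.IndexFlatIP(vectors.shape[1])  # inner product == cosine after normalization
index.add(vectors)

query = encoder.encode(["what happened near the laptop?"], normalize_embeddings=True).astype(np.float32)
scores, ids = index.search(query, 3)
for score, i in zip(scores[0], ids[0]):
    print(f"{score:.3f}  {texts[i]}")
```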

-----

## Tech Stack

| Component | Technology |
|---|---|
| Object Detection (Nouns) | YOLOv8s |
| Intent Classification (Verbs) | MobileNetV3 (custom fine-tuned) |
| Text Extraction (OCR) | EasyOCR |
| ASR (Audio) | OpenAI Whisper (base) |
| Vector Database | FAISS + Sentence-Transformers (`all-MiniLM-L6-v2`) |
| LLM Routing | LiteLLM (`gemini-2.5-flash` default, Ollama supported) |
| GPU Runtime | CUDA 12.1 (optimized for 4 GB+ VRAM) |

-----

## Roadmap

- [ ] **Real-time streaming pipeline** — live ingestion and indexing with low-latency event detection.
- [ ] **Advanced temporal reasoning** — multi-clip reasoning and cross-camera subject tracking.
- [ ] **Interactive Dashboard** — PyQt5 HUD for video playback, timeline visualization, and KB exploration.

-----

## Contributing

Contributions, issues, and feature requests are welcome! Open a GitHub issue or submit a pull request.

-----

## Author

**Rahul Sharma** — B.Tech CSE, IIIT Manipur

## License

Distributed under the [MIT License](LICENSE).

vidchain-0.2.0/VidChain.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,184 @@
(contents identical to vidchain-0.2.0/PKG-INFO above)
vidchain-0.2.0/VidChain.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,26 @@
LICENSE
README.md
pyproject.toml
VidChain.egg-info/PKG-INFO
VidChain.egg-info/SOURCES.txt
VidChain.egg-info/dependency_links.txt
VidChain.egg-info/entry_points.txt
VidChain.egg-info/requires.txt
VidChain.egg-info/top_level.txt
vidchain/__init__.py
vidchain/cli.py
vidchain/processor.py
vidchain/rag.py
vidchain/vision.py
vidchain/core/__init__.py
vidchain/core/fusion.py
vidchain/core/ollama_engine.py
vidchain/llm/__init__.py
vidchain/loaders/__init__.py
vidchain/loaders/audio_loader.py
vidchain/loaders/video_loader.py
vidchain/processors/__init__.py
vidchain/processors/audio_model.py
vidchain/processors/ocr_model.py
vidchain/processors/vision_model.py
vidchain/storage/__init__.py
vidchain-0.2.0/VidChain.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
(a single blank line)
vidchain-0.2.0/VidChain.egg-info/requires.txt
ADDED
@@ -0,0 +1,18 @@
opencv-python
scenedetect
openai-whisper
sentence-transformers
faiss-cpu
litellm
google-generativeai
moviepy
pillow<12.0
torch
torchvision
torchaudio
numpy
tqdm
python-dotenv
ultralytics
librosa
easyocr
vidchain-0.2.0/VidChain.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
vidchain
vidchain-0.2.0/pyproject.toml
ADDED
@@ -0,0 +1,65 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "VidChain"
version = "0.2.0"
description = "A Lightweight Video RAG Framework for Multimodal Reasoning"
readme = "README.md"
requires-python = ">=3.11"
authors = [{ name = "Rahul Sharma", email = "rahulsharma.hps@gmail.com" }]
license = { text = "MIT" }
keywords = ["video-rag", "multimodal", "ai", "computer-vision", "whisper", "ollama", "yolo"]

# Detailed Classifiers for PyPI
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3.11",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]

# Core Dependencies
dependencies = [
    "opencv-python",
    "scenedetect",
    "openai-whisper",        # Provides the 'whisper' import
    "sentence-transformers",
    "faiss-cpu",
    "litellm",               # Added for your 'gemini/gemini-2.5-flash' routing
    "google-generativeai",
    "moviepy",
    "pillow<12.0",           # 🛑 PINNED: Fixes the MoviePy incompatibility crash
    "torch",                 # 🛑 UNPINNED: Allows users to install their own cu118/cu121 versions
    "torchvision",           # 🛑 UNPINNED: Prevents pip from forcing a CPU downgrade
    "torchaudio",            # Added to complete the PyTorch hardware triad
    "numpy",
    "tqdm",
    "python-dotenv",
    "ultralytics",           # YOLO-based Noun Engine
    "librosa",               # Audio energy extraction
    "easyocr",               # GPU-accelerated text extraction
]

# --- CONSOLIDATED CLI ENTRY POINTS ---
[project.scripts]
vidchain-analyze = "vidchain.cli:main"
vidchain-train = "scripts.train_vision:main"

[project.urls]
"Homepage" = "https://github.com/rahulsiiitm/videochain-python"
"Bug Tracker" = "https://github.com/rahulsiiitm/videochain-python/issues"

[tool.setuptools.packages.find]
where = ["."]
include = ["vidchain*"]

[tool.setuptools.package-data]
vidchain = ["py.typed"]

# UV specific configuration for CUDA-enabled Torch
[[tool.uv.index]]
url = "https://download.pytorch.org/whl/cu121"
vidchain-0.2.0/setup.cfg
ADDED