sightrag 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sightrag/__init__.py +31 -0
- sightrag/api.py +189 -0
- sightrag/core.py +129 -0
- sightrag/detector.py +81 -0
- sightrag/embedder.py +83 -0
- sightrag/indexer.py +147 -0
- sightrag/retriever.py +83 -0
- sightrag/store/__init__.py +5 -0
- sightrag/store/base.py +18 -0
- sightrag/store/chroma_store.py +115 -0
- sightrag/store/sqlite_store.py +128 -0
- sightrag/utils/__init__.py +9 -0
- sightrag/utils/camera.py +78 -0
- sightrag/utils/image.py +48 -0
- sightrag/utils/video.py +72 -0
- sightrag-0.1.0.dist-info/METADATA +391 -0
- sightrag-0.1.0.dist-info/RECORD +20 -0
- sightrag-0.1.0.dist-info/WHEEL +5 -0
- sightrag-0.1.0.dist-info/licenses/LICENSE +125 -0
- sightrag-0.1.0.dist-info/top_level.txt +1 -0
sightrag/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SightRAG — Image and Video RAG
|
|
3
|
+
See. Search. Retrieve.
|
|
4
|
+
|
|
5
|
+
pip install sightrag
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from sightrag import SightRAG
|
|
9
|
+
|
|
10
|
+
rag = SightRAG()
|
|
11
|
+
rag.index("./photos/")
|
|
12
|
+
results = rag.query("find empty shelf")
|
|
13
|
+
|
|
14
|
+
REST API:
|
|
15
|
+
from sightrag import serve
|
|
16
|
+
serve(port=8000)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from .core import SightRAG
|
|
20
|
+
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
__author__ = "Ant (VK-Ant)"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def serve(host: str = "0.0.0.0", port: int = 8000):
    """Launch the SightRAG REST API server.

    The API module is imported inside the call rather than at package
    import time, so ``import sightrag`` does not itself import the
    FastAPI/uvicorn-based ``sightrag.api`` module.

    Args:
        host: Interface to bind; forwarded unchanged to ``sightrag.api.serve``.
        port: TCP port to listen on; forwarded unchanged.
    """
    from . import api as _api

    _api.serve(host=host, port=port)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
__all__ = ["SightRAG", "serve"]
|
sightrag/api.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
# sightrag/api.py
|
|
2
|
+
# REST API — FastAPI based
|
|
3
|
+
# Run: sightrag-server or python -m sightrag.api
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import json
|
|
7
|
+
import shutil
|
|
8
|
+
import tempfile
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
|
|
13
|
+
from fastapi.responses import JSONResponse
|
|
14
|
+
import uvicorn
|
|
15
|
+
except ImportError:
|
|
16
|
+
raise ImportError(
|
|
17
|
+
"FastAPI not installed.\n"
|
|
18
|
+
"Run: pip install sightrag[api]"
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
from .core import SightRAG
|
|
22
|
+
|
|
23
|
+
# FastAPI application object; the route decorators in this module register
# their handlers on it.
app = FastAPI(
    title="SightRAG API",
    description="See. Search. Retrieve. — Image and Video RAG",
    version="0.1.0",
)

# Module-wide SightRAG instance. It starts out unset and is created on first
# use by get_rag().
rag = None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_rag():
    """Return the shared SightRAG instance, building it on the first call.

    Configuration is read from the environment when the instance is created:
    ``SIGHTRAG_STORE`` (default ``"sqlite"``), ``SIGHTRAG_DOMAIN`` (default
    ``None``) and ``SIGHTRAG_INDEX`` (default ``"./sightrag_index"``).
    Subsequent calls return the cached module-level ``rag`` and do not
    re-read the environment.
    """
    global rag
    if rag is not None:
        return rag

    backend = os.getenv("SIGHTRAG_STORE", "sqlite")
    hint = os.getenv("SIGHTRAG_DOMAIN", None)
    location = os.getenv("SIGHTRAG_INDEX", "./sightrag_index")
    rag = SightRAG(store=backend, domain_hint=hint, index_path=location)
    return rag
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@app.get("/")
def root():
    """Describe the service: name, version, tagline and available routes."""
    endpoints = {
        "POST /index/folder": "Index an image folder",
        "POST /index/video": "Index a video file",
        "POST /index/upload": "Upload and index images",
        "POST /query/text": "Search with text",
        "POST /query/reference": "Search with reference image",
        "GET /status": "Index status",
        "DELETE /index": "Clear index",
    }
    return {
        "name": "SightRAG API",
        "version": "0.1.0",
        "tagline": "See. Search. Retrieve.",
        "endpoints": endpoints,
    }
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@app.get("/status")
def status():
    """Report the index size, the store backend and the domain hint."""
    engine = get_rag()
    region_total = engine.count()
    return {
        "indexed_regions": region_total,
        "store": engine._store_type,
        "domain_hint": engine.domain_hint,
    }
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@app.post("/index/folder")
def index_folder(path: str = Form(...)):
    """Index a folder that already exists on the server.

    ``path`` is a form field naming a location on the server's filesystem.
    Any exception raised while indexing is reported as HTTP 400 with the
    exception text as the detail.
    """
    engine = get_rag()
    try:
        engine.index(path)
        region_total = engine.count()
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    return {
        "status": "success",
        "indexed_regions": region_total,
        "source": path,
    }
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@app.post("/index/video")
def index_video(path: str = Form(...), fps: int = Form(1)):
    """Index a video file that already exists on the server.

    ``path`` and ``fps`` are form fields; ``fps`` is forwarded to the indexer
    unchanged. Any exception raised while indexing is reported as HTTP 400
    with the exception text as the detail.
    """
    engine = get_rag()
    try:
        engine.index(path, fps=fps)
        region_total = engine.count()
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    return {
        "status": "success",
        "indexed_regions": region_total,
        "source": path,
        "fps": fps,
    }
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@app.post("/index/upload")
async def index_upload(files: list[UploadFile] = File(...)):
    """Upload image files and index them.

    Each upload is written into a fresh temporary directory, that directory
    is indexed, and it is removed again whether or not indexing succeeded.

    Client-supplied filenames are untrusted: only the final path component
    is used, so a name such as ``../../x`` or an absolute path cannot be
    written outside the temporary directory. Uploads without a usable
    filename are rejected.

    Raises:
        HTTPException: 400 if a filename is missing or invalid, or if saving
            or indexing fails (the detail carries the error text).
    """
    r = get_rag()
    upload_dir = tempfile.mkdtemp(prefix="sightrag_upload_")

    try:
        # Save uploaded files
        for f in files:
            # Normalise Windows separators first so basename() also strips
            # "..\\..\\"-style prefixes when the server runs on POSIX.
            safe_name = os.path.basename((f.filename or "").replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                raise HTTPException(
                    status_code=400,
                    detail=f"Invalid upload filename: {f.filename!r}"
                )

            file_path = os.path.join(upload_dir, safe_name)
            # Read before opening so a failed read leaves no empty file behind.
            content = await f.read()
            with open(file_path, "wb") as out:
                out.write(content)

        # Index the upload folder
        # NOTE(review): the directory is deleted in `finally`, so any file
        # paths recorded by the indexer will no longer exist afterwards.
        r.index(upload_dir)

        return {
            "status": "success",
            "files_uploaded": len(files),
            "indexed_regions": r.count()
        }
    except HTTPException:
        # Already a well-formed HTTP error; do not re-wrap it below.
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
    finally:
        shutil.rmtree(upload_dir, ignore_errors=True)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
@app.post("/query/text")
def query_text(text: str = Form(...), top_k: int = Form(5)):
    """Search the index with a plain-English text query.

    Any exception raised by the query is reported as HTTP 400 with the
    exception text as the detail.
    """
    engine = get_rag()
    try:
        hits = engine.query(text=text, top_k=top_k)
        hit_total = len(hits)
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    return {
        "query": text,
        "results": hits,
        "count": hit_total,
    }
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@app.post("/query/reference")
async def query_reference(
    file: UploadFile = File(...),
    top_k: int = Form(5)
):
    """Search the index using an uploaded reference image.

    The upload is written to a temporary file, passed to the query by path,
    and the temporary file is removed afterwards regardless of the outcome.

    Only the final path component of the client-supplied filename is used
    for the temporary file's suffix, so the name cannot inject directory
    separators into the temp path. A missing filename yields a bare ``_``
    suffix.

    Raises:
        HTTPException: 400 if reading the upload or running the query fails
            (the detail carries the error text).
    """
    r = get_rag()

    # The filename is untrusted input; keep only its last component.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
    tmp_path = None
    try:
        content = await file.read()

        # Save reference temporarily. The `with` block closes the handle even
        # if the write fails, so the file can always be deleted below.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=f"_{safe_name}"
        ) as tmp:
            tmp_path = tmp.name
            tmp.write(content)

        results = r.query(reference=tmp_path, top_k=top_k)
        return {
            "reference": file.filename,
            "results": results,
            "count": len(results)
        }
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not replace the
                # response or the error raised above.
                pass
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@app.delete("/index")
def clear_index():
    """Remove all indexed data and report an empty index."""
    engine = get_rag()
    engine.clear()
    response = {"status": "cleared"}
    response["indexed_regions"] = 0
    return response
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def serve(host: str = "0.0.0.0", port: int = 8000):
    """Announce the bind address, then run ``app`` under uvicorn.

    Args:
        host: Interface to bind, passed to ``uvicorn.run``.
        port: TCP port to listen on, passed to ``uvicorn.run``.
    """
    banner = f"[SightRAG] Starting API server on {host}:{port}"
    print(banner)
    uvicorn.run(app, host=host, port=port)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
if __name__ == "__main__":
|
|
189
|
+
serve()
|
sightrag/core.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# sightrag/core.py
|
|
2
|
+
# Main SightRAG class
|
|
3
|
+
# All data stored in ~/.sightrag/ — project folder stays clean
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from .detector import Detector
|
|
8
|
+
from .embedder import Embedder
|
|
9
|
+
from .indexer import Indexer
|
|
10
|
+
from .retriever import Retriever
|
|
11
|
+
|
|
12
|
+
SIGHTRAG_HOME = os.path.join(Path.home(), ".sightrag")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SightRAG:
    """
    SightRAG — Image and Video RAG.
    See. Search. Retrieve.

    Usage:
        rag = SightRAG()
        rag.index("./photos/")
        results = rag.query("find empty shelf")
    """

    def __init__(self,
                 store: str = "sqlite",
                 domain_hint: str = None,
                 index_path: str = None):
        """Build the detector, embedder, store, indexer and retriever.

        Args:
            store: Backend name, ``"sqlite"`` or ``"chroma"``.
            domain_hint: Optional hint passed to the indexer and retriever.
            index_path: Where the store keeps its data. Defaults to
                ``<SIGHTRAG_HOME>/index`` when omitted.

        Raises:
            ValueError: If ``store`` is not a recognised backend name.
        """
        self.domain_hint = domain_hint
        self._store_type = store

        if index_path is None:
            self._index_path = os.path.join(SIGHTRAG_HOME, "index")
        else:
            self._index_path = index_path

        # The home directory is created even when a custom index_path is
        # given; only the index location itself is overridable here.
        os.makedirs(SIGHTRAG_HOME, exist_ok=True)

        print("[SightRAG] Initializing...")
        self._detector = Detector()
        self._embedder = Embedder()
        self._store = self._init_store(store, self._index_path)
        self._indexer = Indexer(
            self._detector, self._embedder,
            self._store, domain_hint
        )
        self._retriever = Retriever(
            self._embedder, self._detector,
            self._store, domain_hint
        )
        print("[SightRAG] Ready.")

    def _init_store(self, store_type: str, path: str):
        """Create the store backend named by ``store_type``.

        ``"chroma"`` falls back to the SQLite store (with a printed notice)
        when importing the Chroma store raises ``ImportError``.

        Raises:
            ValueError: If ``store_type`` is neither ``"chroma"`` nor
                ``"sqlite"``.
        """
        if store_type == "chroma":
            try:
                from .store.chroma_store import ChromaStore
                return ChromaStore(path)
            except ImportError:
                print("[SightRAG] ChromaDB not found. Using SQLite.")
                from .store.sqlite_store import SQLiteStore
                return SQLiteStore(path)
        elif store_type == "sqlite":
            from .store.sqlite_store import SQLiteStore
            return SQLiteStore(path)
        else:
            raise ValueError(f"Unknown store: {store_type}. Use 'chroma' or 'sqlite'.")

    def index(self, path: str = None, source: str = None,
              camera_id: int = 0, fps: int = 1):
        """Index images, video, or camera.

        Args:
            path: A folder, a video file (``.mp4``/``.avi``/``.mov``/``.mkv``)
                or a single image file. Ignored when ``source == "camera"``.
            source: Pass ``"camera"`` to index live camera frames instead of
                a path.
            camera_id: Camera index used when ``source == "camera"``.
            fps: Frame rate passed to the folder, video and camera indexers.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If neither a path nor ``source="camera"`` is given.
            FileNotFoundError: If ``path`` is neither a directory nor a file.
        """
        if source == "camera":
            self._indexer.index_camera(camera_id=camera_id, fps=fps)
            return self

        if path is None:
            raise ValueError("Provide a path or source='camera'")

        if os.path.isdir(path):
            self._indexer.index_folder(path, fps=fps)
        elif os.path.isfile(path):
            ext = os.path.splitext(path)[1].lower()
            if ext in {".mp4", ".avi", ".mov", ".mkv"}:
                self._indexer.index_video(path, fps=fps)
            else:
                # Anything that is not a known video extension is treated as
                # a single image.
                from .utils.image import load_image
                image = load_image(path)
                self._index_single_image(path, image)
                print(f"[SightRAG] 1 image indexed. Total: {self.count()} regions.")
        else:
            raise FileNotFoundError(f"Path not found: {path}")

        return self

    def _index_single_image(self, path, image):
        """Index one image with detection + embedding.

        Region IDs are ``<file stem>_<region index>``, the same scheme the
        folder indexer uses. A fixed prefix would make every single-image
        call reuse the IDs of the previous one.
        """
        import numpy as np
        prefix = Path(path).stem
        regions = self._detector.detect(image)
        for j, region in enumerate(regions):
            embedding = self._embedder.embed_image(region["crop"])
            # An all-zero vector is what the embedder returns on failure;
            # such regions are not stored.
            if not np.allclose(embedding, 0):
                self._store.add(f"{prefix}_{j}", embedding, {
                    "image_path": str(path),
                    "bbox": region["bbox"],
                    "label": region["label"],
                    "confidence": region["confidence"],
                    "source_type": "image"
                })

    def query(self, text: str = None, reference: str = None, top_k: int = 5):
        """Search indexed content with text or reference image.

        A non-empty ``text`` takes precedence over ``reference``. An empty
        ``text`` counts as "not provided".

        Raises:
            ValueError: If there is neither non-empty text nor a reference.
        """
        if text:
            return self._retriever.query_text(text, top_k)
        if reference is None:
            # Covers text=None and text="" alike, so None is never passed on
            # as the reference path.
            raise ValueError("Provide text or reference image.")
        return self._retriever.query_reference(reference, top_k)

    def count(self) -> int:
        """Return the number of entries reported by the store."""
        return self._store.count()

    def clear(self):
        """Clear the store and return ``self``."""
        self._store.clear()
        print("[SightRAG] Index cleared.")
        return self

    def __repr__(self):
        return f"SightRAG(store='{self._store_type}', indexed={self.count()} regions)"
|
sightrag/detector.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# sightrag/detector.py
|
|
2
|
+
# YOLO detection — models stored in ~/.sightrag/models/
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import numpy as np
|
|
6
|
+
from PIL import Image
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
MODEL_DIR = os.path.join(Path.home(), ".sightrag", "models")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Detector:
    """YOLO object detector with whole-image fallback."""

    def __init__(self, model_size: str = "yolo11n.pt", model_dir: str = None):
        """Create the model directory if needed and try to load the model.

        Args:
            model_size: Weights file name, e.g. ``"yolo11n.pt"``.
            model_dir: Directory holding the weights; defaults to
                ``MODEL_DIR``.
        """
        self.model = None
        self.model_size = model_size
        self.model_dir = model_dir or MODEL_DIR
        os.makedirs(self.model_dir, exist_ok=True)
        self._load()

    def _load(self):
        """Load the YOLO model, leaving ``self.model`` as None on failure.

        Weights already present in ``model_dir`` are loaded from there.
        Otherwise the model is created from its bare name and, if a weights
        file of that name then exists in the current working directory, it
        is moved into ``model_dir``. Any error, including ultralytics not
        being installed, is printed and disables detection.
        """
        try:
            from ultralytics import YOLO
            import logging
            logging.getLogger("ultralytics").setLevel(logging.WARNING)

            model_path = os.path.join(self.model_dir, self.model_size)

            if os.path.exists(model_path):
                self.model = YOLO(model_path)
            else:
                # Download and move to our folder
                # NOTE(review): presumes ultralytics saves the downloaded
                # weights into the current working directory; confirm.
                self.model = YOLO(self.model_size)
                cwd_model = os.path.join(os.getcwd(), self.model_size)
                if os.path.exists(cwd_model) and cwd_model != model_path:
                    import shutil
                    shutil.move(cwd_model, model_path)

        except Exception as e:
            print(f"[SightRAG] YOLO not available: {str(e)[:100]}")
            self.model = None

    def detect(self, image: "Image.Image", confidence: float = 0.25):
        """Detect objects. Always returns at least whole image.

        Args:
            image: Image to analyse; its ``size`` and ``crop`` are used.
            confidence: Threshold passed to the model as ``conf``.

        Returns:
            A list of dicts with keys ``crop``, ``bbox`` (``[x1, y1, x2, y2]``
            clamped to the image), ``label`` and ``confidence``. Detected
            regions come first; the last entry is always the whole image
            with label ``"whole_image"`` and confidence ``1.0``.
        """
        regions = []
        w, h = image.size

        if self.model is not None:
            try:
                results = self.model(image, conf=confidence, verbose=False)
                for result in results:
                    if result.boxes is None or len(result.boxes) == 0:
                        continue
                    for box in result.boxes:
                        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
                        # Clamp the box to the image bounds.
                        x1, y1 = max(0, x1), max(0, y1)
                        x2, y2 = min(w, x2), min(h, y2)
                        # Skip boxes narrower or shorter than 10 pixels.
                        if (x2 - x1) < 10 or (y2 - y1) < 10:
                            continue
                        regions.append({
                            "crop": image.crop((x1, y1, x2, y2)),
                            "bbox": [x1, y1, x2, y2],
                            "label": result.names[int(box.cls[0])],
                            "confidence": float(box.conf[0])
                        })
            except Exception as e:
                # Detection stays best-effort: regions found before the
                # error are kept and the whole image is still added below.
                # Catching Exception instead of everything lets Ctrl+C and
                # SystemExit through.
                print(f"[SightRAG] Detection error: {str(e)[:100]}")

        # Always add whole image
        regions.append({
            "crop": image,
            "bbox": [0, 0, w, h],
            "label": "whole_image",
            "confidence": 1.0
        })

        return regions
|
sightrag/embedder.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# sightrag/embedder.py
|
|
2
|
+
# CLIP embedder — works across all transformers versions
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import numpy as np
|
|
6
|
+
from PIL import Image
|
|
7
|
+
|
|
8
|
+
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
|
|
9
|
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Embedder:
    """CLIP embedder for images and text queries."""

    def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
        """Load the CLIP model and processor named by ``model_name``."""
        self.model = None
        self.processor = None
        self.embed_dim = None
        self._load(model_name)

    def _load(self, model_name):
        """Load model and processor and record the projection dimension.

        Warnings are silenced only while importing and loading the model, so
        the rest of the process keeps its own warning filters.
        """
        import warnings

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            from transformers import CLIPModel, CLIPProcessor
            self.model = CLIPModel.from_pretrained(model_name)
            self.processor = CLIPProcessor.from_pretrained(model_name)

        self.model.eval()
        self.embed_dim = self.model.config.projection_dim

    @staticmethod
    def _to_unit_vector(projected) -> np.ndarray:
        """Turn the first row of ``projected`` into a float32 unit vector.

        A zero-norm vector is returned unchanged to avoid dividing by zero.
        """
        emb = projected[0].detach().cpu().numpy().astype(np.float32)
        norm = np.linalg.norm(emb)
        return emb / norm if norm > 0 else emb

    def embed_image(self, image: "Image.Image") -> np.ndarray:
        """Embed image → fixed-size normalized vector.

        Non-RGB images are converted to RGB first. On any error the message
        is printed and an all-zero vector of length ``embed_dim`` is
        returned instead of raising.
        """
        import torch
        try:
            if image.mode != "RGB":
                image = image.convert("RGB")

            # Get pixel values
            inputs = self.processor(images=image, return_tensors="pt", padding=True)
            pixel_values = inputs["pixel_values"]

            with torch.no_grad():
                # Use vision model + projection explicitly
                # This guarantees correct output dim across all versions
                vision_out = self.model.vision_model(pixel_values=pixel_values)
                pooled = vision_out.pooler_output
                projected = self.model.visual_projection(pooled)

            return self._to_unit_vector(projected)

        except Exception as e:
            print(f"[SightRAG] Image embed error: {e}")
            return np.zeros(self.embed_dim, dtype=np.float32)

    def embed_text(self, text: str, domain_hint: str = None) -> np.ndarray:
        """Embed text query → fixed-size normalized vector.

        When ``domain_hint`` is truthy it is appended to the text, separated
        by a space, before embedding. On any error the message is printed
        and an all-zero vector of length ``embed_dim`` is returned instead
        of raising.
        """
        import torch
        try:
            query = f"{text} {domain_hint}" if domain_hint else text

            inputs = self.processor(
                text=[query], return_tensors="pt",
                padding=True, truncation=True
            )

            with torch.no_grad():
                # Use text model + projection explicitly
                text_out = self.model.text_model(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"]
                )
                pooled = text_out.pooler_output
                projected = self.model.text_projection(pooled)

            return self._to_unit_vector(projected)

        except Exception as e:
            print(f"[SightRAG] Text embed error: {e}")
            return np.zeros(self.embed_dim, dtype=np.float32)
|
sightrag/indexer.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
# sightrag/indexer.py
|
|
2
|
+
# Image, video, camera indexing — clean output only
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import numpy as np
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from .detector import Detector
|
|
8
|
+
from .embedder import Embedder
|
|
9
|
+
from .utils.image import load_image, SUPPORTED_FORMATS as IMAGE_FORMATS
|
|
10
|
+
from .utils.video import extract_frames, SUPPORTED_FORMATS as VIDEO_FORMATS
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Indexer:
    """Indexes images, videos, and camera frames."""

    def __init__(self, detector, embedder, store, domain_hint=None):
        """Keep references to the collaborating detector, embedder and store."""
        self.detector = detector
        self.embedder = embedder
        self.store = store
        self.domain_hint = domain_hint

    def _store_regions(self, image, id_prefix, image_path, source_type, extra=None):
        """Detect, embed and store every region of ``image``.

        Args:
            image: Image handed to the detector.
            id_prefix: Text placed before the region index to form the ID.
            image_path: Value stored under the ``image_path`` metadata key.
            source_type: Value stored under the ``source_type`` metadata key.
            extra: Optional extra metadata (e.g. ``{"timestamp": ...}``),
                placed between ``confidence`` and ``source_type``.

        Returns:
            Number of regions actually stored.
        """
        stored = 0
        for j, region in enumerate(self.detector.detect(image)):
            embedding = self.embedder.embed_image(region["crop"])
            # The embedder returns an all-zero vector on failure; skip those.
            if np.allclose(embedding, 0):
                continue
            self.store.add(f"{id_prefix}{j}", embedding, {
                "image_path": image_path,
                "bbox": region["bbox"],
                "label": region["label"],
                "confidence": region["confidence"],
                **(extra or {}),
                "source_type": source_type
            })
            stored += 1
        return stored

    def _index_image(self, path_str, image, prefix):
        """Index one image — detect regions, embed, store.

        Region IDs are ``<prefix>_<region index>``. Returns the number of
        regions stored.
        """
        return self._store_regions(image, f"{prefix}_", path_str, "image")

    @staticmethod
    def _find_files(folder, formats):
        """Return sorted, de-duplicated files in ``folder`` matching ``formats``.

        Each format is globbed both as given and upper-cased.
        """
        found = set()
        for fmt in formats:
            found.update(folder.glob(f"*{fmt}"))
            found.update(folder.glob(f"*{fmt.upper()}"))
        return sorted(found)

    def index_folder(self, folder_path: str, fps: int = 1):
        """Index all images AND videos in a folder.

        Only the folder's direct children are considered. Files that fail to
        load or index are reported and skipped.

        Raises:
            FileNotFoundError: If ``folder_path`` does not exist.
            ValueError: If it is not a directory, or holds no matching files.
        """
        folder = Path(folder_path)
        if not folder.exists():
            raise FileNotFoundError(f"Folder not found: {folder}")
        if not folder.is_dir():
            raise ValueError(f"Not a folder: {folder}")

        image_paths = self._find_files(folder, IMAGE_FORMATS)
        video_paths = self._find_files(folder, VIDEO_FORMATS)

        if not image_paths and not video_paths:
            raise ValueError(f"No images or videos in {folder}")

        print(f"[SightRAG] Found {len(image_paths)} images, {len(video_paths)} videos")

        # Index images
        if image_paths:
            total = len(image_paths)
            for i, path in enumerate(image_paths, 1):
                try:
                    image = load_image(str(path))
                    self._index_image(str(path), image, path.stem)
                    pct = int((i / total) * 40)
                    bar = "█" * pct + "░" * (40 - pct)
                    print(f"\r  [{bar}] {i}/{total} images", end="", flush=True)
                except Exception as e:
                    print(f"\n  Skipping {path.name}: {e}")
            print()

        # Index videos
        if video_paths:
            for v_idx, vpath in enumerate(video_paths, 1):
                try:
                    print(f"[SightRAG] Video {v_idx}/{len(video_paths)}: {vpath.name}")
                    self._index_video(str(vpath), fps)
                except Exception as e:
                    print(f"  Skipping {vpath.name}: {e}")

        print(f"[SightRAG] Done. {self.store.count()} regions indexed.")

    def index_video(self, video_path: str, fps: int = 1):
        """Index a single video file."""
        self._index_video(video_path, fps)
        print(f"[SightRAG] Done. {self.store.count()} regions indexed.")

    def _index_video(self, video_path: str, fps: int = 1):
        """Internal video indexing.

        Region IDs are ``<video stem>_f<frame number>_r<region index>`` with
        frames numbered from 1. A frame that fails is reported and skipped.
        """
        video_name = Path(video_path).stem
        frames = extract_frames(video_path, fps=fps)
        total = len(frames)
        print(f"  {total} frames extracted...")

        for i, (image, timestamp) in enumerate(frames, 1):
            try:
                self._store_regions(
                    image, f"{video_name}_f{i}_r", video_path, "video",
                    {"timestamp": timestamp}
                )
                pct = int((i / total) * 40)
                bar = "█" * pct + "░" * (40 - pct)
                print(f"\r  [{bar}] {i}/{total} frames", end="", flush=True)
            except Exception as e:
                # Catching Exception instead of everything keeps Ctrl+C
                # working and reports the frame that was dropped.
                print(f"\n  Skipping frame {i}: {e}")
        print()

    def index_camera(self, camera_id=0, fps=1, buffer_seconds=60):
        """Index live camera frames.

        Runs until ``capture_frames`` stops yielding or the user presses
        Ctrl+C. Region IDs are ``cam<camera_id>_<timestamp>_<region index>``.
        """
        from .utils.camera import capture_frames
        print(f"[SightRAG] Camera {camera_id}. Press Ctrl+C to stop.")

        count = 0
        try:
            for image, timestamp in capture_frames(camera_id, fps, buffer_seconds):
                self._store_regions(
                    image, f"cam{camera_id}_{timestamp}_",
                    f"camera_{camera_id}", "camera",
                    {"timestamp": timestamp}
                )
                count += 1
                print(f"\r[SightRAG] {count} frames | {timestamp}", end="", flush=True)
        except KeyboardInterrupt:
            print(f"\n[SightRAG] Stopped. {self.store.count()} regions indexed.")
|