@blueharford/scrypted-spatial-awareness 0.4.8-beta.1 → 0.5.0-beta

package/README.md CHANGED
@@ -51,6 +51,7 @@ Done! Your camera topology is configured.
  ### Visual Editor
  - **Floor Plan** - Upload image or draw with built-in tools
  - **Drag & Drop** - Place cameras, draw connections
+ - **Polygon Zone Drawing** - Draw custom zones (yards, driveways, patios, etc.)
  - **Live Tracking** - Watch objects move in real-time

  ### AI Features (optional)
@@ -58,6 +59,7 @@ Done! Your camera topology is configured.
  - **Auto-Learning** - Transit times adjust based on observations
  - **Connection Suggestions** - System suggests new camera paths
  - **Landmark Discovery** - AI identifies landmarks from footage
+ - **Auto-Topology Discovery** - Vision LLM analyzes camera views to build topology

  ### Integrations
  - **MQTT** - Home Assistant integration
@@ -170,6 +172,98 @@ Base URL: `/endpoint/@blueharford/scrypted-spatial-awareness`
  | `/api/training/apply` | POST | Apply results to topology |
  | `/api/training/status` | GET | Current training status |

+ ### Discovery API
+
+ | Endpoint | Method | Description |
+ |----------|--------|-------------|
+ | `/api/discovery/scan` | POST | Run full discovery scan |
+ | `/api/discovery/status` | GET | Current discovery status |
+ | `/api/discovery/suggestions` | GET | Pending suggestions |
+ | `/api/discovery/camera/{id}` | GET | Analyze single camera |
+
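A scan can be driven from any HTTP client against the base URL documented earlier. Below is a minimal TypeScript sketch; the endpoints come from the table above, but the response fields (e.g. `running`) are assumptions for illustration, not documented by the plugin:

```ts
// Sketch: trigger a discovery scan, poll until it finishes, then list suggestions.
const base = '/endpoint/@blueharford/scrypted-spatial-awareness';

async function runDiscoveryScan() {
  await fetch(`${base}/api/discovery/scan`, { method: 'POST' });

  // Poll status until the scan is no longer running (field name assumed).
  for (;;) {
    const status = await (await fetch(`${base}/api/discovery/status`)).json();
    if (!status.running) break;
    await new Promise(resolve => setTimeout(resolve, 5000));
  }

  const suggestions = await (await fetch(`${base}/api/discovery/suggestions`)).json();
  console.log('Pending suggestions:', suggestions);
}
```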
+ ## Auto-Topology Discovery
+
+ The plugin can automatically analyze camera views using a vision-capable LLM to discover landmarks, zones, and camera connections.
+
+ ### How It Works
+
+ 1. **Capture Snapshots** - System takes a picture from each camera
+ 2. **Scene Analysis** - Vision LLM identifies landmarks, zones, and edges in each view
+ 3. **Cross-Camera Correlation** - LLM correlates findings across cameras to identify shared landmarks and connections
+ 4. **Suggestions** - Discoveries are presented as suggestions you can accept or reject
+
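The four steps above map roughly to the loop sketched here. Only `camera.takePicture()` and the `mediaObjectToBase64` helper (added in this release; see the source diff further down) are real; `analyzeScene` and `correlate` are illustrative stand-ins for the vision-LLM calls:

```ts
// Hedged sketch of the discovery pipeline, not the plugin's actual internals.
declare function mediaObjectToBase64(picture: unknown): Promise<string | null>;
const analyzeScene = async (cameraId: string, image: string) =>
  ({ cameraId, landmarks: [], zones: [], edges: [] }); // stub
const correlate = async (findings: object[]) =>
  ({ landmarks: [], connections: [] }); // stub

async function discover(cameras: { id: string; takePicture(): Promise<unknown> }[]) {
  const findings = [];
  for (const camera of cameras) {
    const picture = await camera.takePicture();          // 1. capture snapshot
    const image = await mediaObjectToBase64(picture);
    if (!image) continue;
    findings.push(await analyzeScene(camera.id, image)); // 2. scene analysis
  }
  const suggestions = await correlate(findings);         // 3. cross-camera correlation
  return suggestions;                                    // 4. presented for accept/reject
}
```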
+ ### Using Discovery
+
+ **Manual Scan:**
+ 1. Open the topology editor (`/ui/editor`)
+ 2. Find the "Auto-Discovery" section in the sidebar
+ 3. Click "Scan Now"
+ 4. Review and accept/reject suggestions
+
+ **Automatic Scan:**
+ - Set `Auto-Discovery Interval (hours)` in plugin settings
+ - System will periodically scan and generate suggestions
+ - Set to 0 to disable automatic scanning
+
+ ### Discovery Settings
+
+ | Setting | Default | Description |
+ |---------|---------|-------------|
+ | Auto-Discovery Interval | 0 (disabled) | Hours between automatic scans (0 = disabled) |
+ | Min Landmark Confidence | 0.6 | Minimum confidence for landmark suggestions |
+ | Min Connection Confidence | 0.5 | Minimum confidence for connection suggestions |
+ | Auto-Accept Threshold | 0.85 | Auto-accept suggestions above this confidence |
+
+ > **Rate Limiting Note:** If you set the interval to less than 1 hour, a warning will appear in the discovery status. Frequent scans can consume significant LLM API quota and may be rate-limited by your provider.
+
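Taken together, the thresholds imply a simple triage rule: below the per-kind minimum a suggestion is dropped, above the auto-accept threshold it is applied, and anything in between goes to manual review. A sketch with a hypothetical `Suggestion` shape (the plugin's actual schema is not documented here):

```ts
// Hypothetical shape, for illustrating how the three thresholds interact.
interface Suggestion {
  kind: 'landmark' | 'connection';
  confidence: number; // 0..1, reported by the vision LLM
}

const MIN_LANDMARK = 0.6;   // Min Landmark Confidence
const MIN_CONNECTION = 0.5; // Min Connection Confidence
const AUTO_ACCEPT = 0.85;   // Auto-Accept Threshold

function triage(s: Suggestion): 'drop' | 'review' | 'accept' {
  const min = s.kind === 'landmark' ? MIN_LANDMARK : MIN_CONNECTION;
  if (s.confidence < min) return 'drop';            // below minimum: never shown
  if (s.confidence >= AUTO_ACCEPT) return 'accept'; // above threshold: applied automatically
  return 'review';                                  // otherwise queued for manual review
}
```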
+ ### Requirements
+
+ - **Vision-capable LLM** - Install @scrypted/llm with a vision model (OpenAI GPT-4V, Claude, etc.)
+ - **Camera access** - Plugin needs the `camera.takePicture()` capability
+
+ ### What Gets Discovered
+
+ - **Landmarks**: Doors, gates, mailboxes, garages, structures, fences
+ - **Zones**: Front yard, driveway, patio, street, walkways
+ - **Connections**: Suggested camera paths with transit time estimates
+ - **Edges**: What's visible at frame boundaries (used for cross-camera correlation)
+
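As a rough mental model, the four categories above can be thought of as a tagged union like the sketch below. The field names are assumptions for illustration, not the plugin's actual schema:

```ts
// Hypothetical shapes for the four discovery categories.
type Discovery =
  | { kind: 'landmark'; name: string; confidence: number }                 // doors, gates, mailboxes...
  | { kind: 'zone'; name: string; zoneType: string; confidence: number }   // front yard, driveway...
  | { kind: 'connection'; fromCameraId: string; toCameraId: string; transitTimeSec: number }
  | { kind: 'edge'; cameraId: string; side: 'left' | 'right' | 'top' | 'bottom'; description: string };
```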
+ ## Zone Drawing
+
+ The visual editor includes a polygon zone drawing tool for marking areas on your floor plan.
+
+ ### How to Draw Zones
+
+ 1. Click the **Draw Zone** button in the toolbar (green)
+ 2. Enter a zone name and select the type (yard, driveway, patio, etc.)
+ 3. Click **Start Drawing**
+ 4. Click on the canvas to add polygon points
+ 5. **Double-click** or press **Enter** to finish the zone
+ 6. Press **Escape** to cancel, **Backspace** to undo last point
+
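The mouse and keyboard behavior in these steps amounts to a small editing state machine; a minimal sketch (not the editor's actual implementation):

```ts
// Sketch of the zone-drawing interaction described above.
interface Point { x: number; y: number; }

class ZoneDraft {
  points: Point[] = [];

  addPoint(p: Point) { this.points.push(p); } // canvas click
  undoLastPoint() { this.points.pop(); }      // Backspace
  cancel() { this.points = []; }              // Escape

  // Double-click / Enter: a polygon needs at least 3 vertices to close.
  finish(): Point[] | null {
    return this.points.length >= 3 ? this.points : null;
  }
}
```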
+ ### Zone Types
+
+ | Type | Color | Description |
+ |------|-------|-------------|
+ | Yard | Green | Front yard, backyard, side yard |
+ | Driveway | Gray | Driveway, parking area |
+ | Street | Dark Gray | Street, sidewalk |
+ | Patio | Orange | Patio, deck |
+ | Walkway | Brown | Walkways, paths |
+ | Parking | Light Gray | Parking lot, parking space |
+ | Garden | Light Green | Garden, landscaped area |
+ | Pool | Blue | Pool area |
+ | Garage | Medium Gray | Garage area |
+ | Entrance | Pink | Entry areas |
+ | Custom | Purple | Custom zone type |
+
+ ### Using Zones
+
+ - Click on a zone to select it and edit its properties
+ - Zones are color-coded by type for easy identification
+ - Zones help provide context for object movement descriptions
+ - Auto-Discovery can suggest zones based on camera analysis
+
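Mapping a detection to a zone reduces to a point-in-polygon test against the drawn vertices. A standard ray-casting sketch (a generic algorithm, not code from the plugin):

```ts
// Generic ray-casting point-in-polygon test.
interface Pt { x: number; y: number; }

function pointInPolygon(p: Pt, polygon: Pt[]): boolean {
  let inside = false;
  for (let i = 0, j = polygon.length - 1; i < polygon.length; j = i++) {
    const a = polygon[i], b = polygon[j];
    // Does the horizontal ray from p cross edge (a, b)?
    const crosses =
      (a.y > p.y) !== (b.y > p.y) &&
      p.x < ((b.x - a.x) * (p.y - a.y)) / (b.y - a.y) + a.x;
    if (crosses) inside = !inside;
  }
  return inside;
}
```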
  ## MQTT Topics

  Base: `scrypted/spatial-awareness`
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@blueharford/scrypted-spatial-awareness",
-   "version": "0.4.8-beta.1",
+   "version": "0.5.0-beta",
    "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
    "author": "Joshua Seidel <blueharford>",
    "license": "Apache-2.0",
@@ -10,6 +10,7 @@ import sdk, {
    Camera,
    MediaObject,
    ScryptedDevice,
+   ScryptedMimeTypes,
  } from '@scrypted/sdk';
  import {
    CameraTopology,
@@ -26,7 +27,7 @@ import {
  } from '../models/topology';
  import { TrackedObject, ObjectSighting } from '../models/tracked-object';

- const { systemManager } = sdk;
+ const { systemManager, mediaManager } = sdk;

  /** Configuration for the spatial reasoning engine */
  export interface SpatialReasoningConfig {
@@ -68,6 +69,29 @@ interface ChatCompletionDevice extends ScryptedDevice {
    streamChatCompletion?(params: any): AsyncGenerator<any>;
  }

+ /**
+  * Convert a MediaObject to a base64 data URL for vision LLM consumption
+  * @param mediaObject - MediaObject from camera.takePicture()
+  * @returns Base64 data URL (data:image/jpeg;base64,...) or null if conversion fails
+  */
+ export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<string | null> {
+   try {
+     // Convert MediaObject to Buffer using mediaManager
+     const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
+
+     // Convert buffer to base64
+     const base64 = buffer.toString('base64');
+
+     // Determine MIME type - default to JPEG for camera images
+     const mimeType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+
+     return `data:${mimeType};base64,${base64}`;
+   } catch (e) {
+     console.warn('Failed to convert MediaObject to base64:', e);
+     return null;
+   }
+ }
+
  export class SpatialReasoningEngine {
    private config: SpatialReasoningConfig;
    private console: Console;
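Using the new helper is straightforward; a hedged sketch of feeding a camera snapshot into it (the import path is illustrative, not the package's actual module layout):

```ts
import { Camera } from '@scrypted/sdk';
import { mediaObjectToBase64 } from './spatial-reasoning'; // illustrative path

// Sketch: camera snapshot -> base64 data URL ready for a vision LLM message.
async function snapshotAsDataUrl(camera: Camera): Promise<string | null> {
  const picture = await camera.takePicture(); // MediaObject
  return mediaObjectToBase64(picture);        // "data:image/jpeg;base64,..." or null
}
```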
@@ -712,7 +736,7 @@ export class SpatialReasoningEngine {
    return connection.name || undefined;
  }

- /** Get LLM-enhanced description using ChatCompletion interface */
+ /** Get LLM-enhanced description using ChatCompletion interface with vision support */
  private async getLlmEnhancedDescription(
    tracked: TrackedObject,
    fromCamera: CameraNode,
@@ -726,6 +750,9 @@ export class SpatialReasoningEngine {
    if (!llm || !llm.getChatCompletion) return null;

    try {
+     // Convert image to base64 for vision LLM
+     const imageBase64 = await mediaObjectToBase64(mediaObject);
+
      // Retrieve relevant context for RAG
      const relevantChunks = this.retrieveRelevantContext(
        fromCamera.deviceId,
@@ -746,12 +773,25 @@ export class SpatialReasoningEngine {
        ragContext
      );

+     // Build message content - use multimodal format if we have an image
+     let messageContent: any;
+     if (imageBase64) {
+       // Vision-capable multimodal message format (OpenAI compatible)
+       messageContent = [
+         { type: 'text', text: prompt },
+         { type: 'image_url', image_url: { url: imageBase64 } },
+       ];
+     } else {
+       // Fall back to text-only if image conversion failed
+       messageContent = prompt;
+     }
+
      // Call LLM using ChatCompletion interface
      const result = await llm.getChatCompletion({
        messages: [
          {
            role: 'user',
-           content: prompt,
+           content: messageContent,
          },
        ],
        max_tokens: 150,
@@ -809,7 +849,7 @@ Examples of good descriptions:
  Generate ONLY the description, nothing else:`;
  }

- /** Suggest a new landmark based on AI analysis using ChatCompletion */
+ /** Suggest a new landmark based on AI analysis using ChatCompletion with vision */
  async suggestLandmark(
    cameraId: string,
    mediaObject: MediaObject,
@@ -822,6 +862,9 @@ Generate ONLY the description, nothing else:`;
    if (!llm || !llm.getChatCompletion) return null;

    try {
+     // Convert image to base64 for vision LLM
+     const imageBase64 = await mediaObjectToBase64(mediaObject);
+
      const prompt = `Analyze this security camera image. A ${objectClass} was detected.

  Looking at the surroundings and environment, identify any notable landmarks or features visible that could help describe this location. Consider:
@@ -835,12 +878,25 @@ If you can identify a clear landmark feature, respond with ONLY a JSON object:

  If no clear landmark is identifiable, respond with: {"name": null}`;

+     // Build message content - use multimodal format if we have an image
+     let messageContent: any;
+     if (imageBase64) {
+       // Vision-capable multimodal message format (OpenAI compatible)
+       messageContent = [
+         { type: 'text', text: prompt },
+         { type: 'image_url', image_url: { url: imageBase64 } },
+       ];
+     } else {
+       // Fall back to text-only if image conversion failed
+       messageContent = prompt;
+     }
+
      // Call LLM using ChatCompletion interface
      const result = await llm.getChatCompletion({
        messages: [
          {
            role: 'user',
-           content: prompt,
+           content: messageContent,
          },
        ],
        max_tokens: 100,