@blueharford/scrypted-spatial-awareness 0.5.1 → 0.5.2

package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blueharford/scrypted-spatial-awareness",
-  "version": "0.5.1",
+  "version": "0.5.2",
   "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
   "author": "Joshua Seidel <blueharford>",
   "license": "Apache-2.0",
@@ -69,29 +69,54 @@ interface ChatCompletionDevice extends ScryptedDevice {
   streamChatCompletion?(params: any): AsyncGenerator<any>;
 }
 
+/** Image data for LLM vision APIs */
+export interface ImageData {
+  /** Raw base64 encoded image data (no data URL prefix) */
+  base64: string;
+  /** MIME type (e.g., 'image/jpeg') */
+  mediaType: string;
+}
+
 /**
- * Convert a MediaObject to a base64 data URL for vision LLM consumption
+ * Convert a MediaObject to base64 image data for vision LLM consumption
  * @param mediaObject - MediaObject from camera.takePicture()
- * @returns Base64 data URL (data:image/jpeg;base64,...) or null if conversion fails
+ * @returns ImageData with raw base64 and media type, or null if conversion fails
  */
-export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<string | null> {
+export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<ImageData | null> {
   try {
     // Convert MediaObject to Buffer using mediaManager
     const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
 
-    // Convert buffer to base64
+    // Convert buffer to base64 (raw, no data URL prefix)
     const base64 = buffer.toString('base64');
 
     // Determine MIME type - default to JPEG for camera images
-    const mimeType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+    const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
 
-    return `data:${mimeType};base64,${base64}`;
+    return { base64, mediaType };
   } catch (e) {
     console.warn('Failed to convert MediaObject to base64:', e);
     return null;
   }
 }
 
+/**
+ * Build image content block for ChatCompletion API
+ * Compatible with both OpenAI and Anthropic formats via @scrypted/llm
+ */
+export function buildImageContent(imageData: ImageData): any {
+  // Use Anthropic's native format which @scrypted/llm should translate
+  // This format is more explicit about the base64 data
+  return {
+    type: 'image',
+    source: {
+      type: 'base64',
+      media_type: imageData.mediaType,
+      data: imageData.base64,
+    },
+  };
+}
+
 export class SpatialReasoningEngine {
   private config: SpatialReasoningConfig;
   private console: Console;
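
For reference, here is a minimal sketch of how the two new helpers compose into a multimodal message. The `describeSnapshot` wrapper and `camera` parameter are illustrative, not part of the package; only `mediaObjectToBase64` and `buildImageContent` come from the diff above.

```ts
// Sketch only: assumes a Scrypted plugin context where `camera` implements Camera.
import { Camera, MediaObject } from '@scrypted/sdk';
import { mediaObjectToBase64, buildImageContent } from './spatial-reasoning';

async function describeSnapshot(camera: Camera, prompt: string) {
  const mediaObject: MediaObject = await camera.takePicture();
  const imageData = await mediaObjectToBase64(mediaObject);

  // Fall back to text-only content when conversion fails, mirroring the plugin code.
  const content: any = imageData
    ? [{ type: 'text', text: prompt }, buildImageContent(imageData)]
    : prompt;

  return { role: 'user', content };
}
```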
@@ -751,7 +776,7 @@ export class SpatialReasoningEngine {
 
     try {
       // Convert image to base64 for vision LLM
-      const imageBase64 = await mediaObjectToBase64(mediaObject);
+      const imageData = await mediaObjectToBase64(mediaObject);
 
       // Retrieve relevant context for RAG
       const relevantChunks = this.retrieveRelevantContext(
@@ -775,11 +800,11 @@
 
       // Build message content - use multimodal format if we have an image
       let messageContent: any;
-      if (imageBase64) {
-        // Vision-capable multimodal message format (OpenAI compatible)
+      if (imageData) {
+        // Vision-capable multimodal message format (Anthropic native format)
         messageContent = [
           { type: 'text', text: prompt },
-          { type: 'image_url', image_url: { url: imageBase64 } },
+          buildImageContent(imageData),
         ];
       } else {
         // Fallback to text-only if image conversion failed
@@ -863,7 +888,7 @@ Generate ONLY the description, nothing else:`;
 
     try {
       // Convert image to base64 for vision LLM
-      const imageBase64 = await mediaObjectToBase64(mediaObject);
+      const imageData = await mediaObjectToBase64(mediaObject);
 
       const prompt = `Analyze this security camera image. A ${objectClass} was detected.
 
@@ -880,11 +905,11 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
 
       // Build message content - use multimodal format if we have an image
       let messageContent: any;
-      if (imageBase64) {
-        // Vision-capable multimodal message format (OpenAI compatible)
+      if (imageData) {
+        // Vision-capable multimodal message format (Anthropic native format)
         messageContent = [
           { type: 'text', text: prompt },
-          { type: 'image_url', image_url: { url: imageBase64 } },
+          buildImageContent(imageData),
         ];
       } else {
         // Fallback to text-only if image conversion failed
@@ -30,7 +30,7 @@ import {
   Landmark,
   findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64 } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData } from './spatial-reasoning';
 
 const { systemManager } = sdk;
 
@@ -191,8 +191,8 @@ export class TopologyDiscoveryEngine {
     return null;
   }
 
-  /** Get camera snapshot as base64 */
-  private async getCameraSnapshot(cameraId: string): Promise<string | null> {
+  /** Get camera snapshot as ImageData */
+  private async getCameraSnapshot(cameraId: string): Promise<ImageData | null> {
     try {
       const camera = systemManager.getDeviceById<Camera>(cameraId);
       if (!camera?.interfaces?.includes(ScryptedInterface.Camera)) {
@@ -230,21 +230,21 @@ export class TopologyDiscoveryEngine {
       return analysis;
     }
 
-    const imageBase64 = await this.getCameraSnapshot(cameraId);
-    if (!imageBase64) {
+    const imageData = await this.getCameraSnapshot(cameraId);
+    if (!imageData) {
       analysis.error = 'Failed to capture camera snapshot';
       return analysis;
     }
 
     try {
-      // Build multimodal message
+      // Build multimodal message with Anthropic-native format
      const result = await llm.getChatCompletion({
        messages: [
          {
            role: 'user',
            content: [
              { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-              { type: 'image_url', image_url: { url: imageBase64 } },
+              buildImageContent(imageData),
            ],
          },
        ],
@@ -387,6 +387,14 @@ export class TopologyDiscoveryEngine {
     this.status.camerasAnalyzed = analyses.length;
     this.console.log(`[Discovery] Analyzed ${analyses.length} cameras successfully`);
 
+    // Handle case where no cameras were successfully analyzed
+    if (analyses.length === 0) {
+      this.console.warn('[Discovery] No cameras were successfully analyzed');
+      this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
+      this.status.lastScanTime = Date.now();
+      return null;
+    }
+
     // Correlate if we have multiple cameras
     let correlation: TopologyCorrelation | null = null;
     if (analyses.length >= 2) {
@@ -394,7 +402,7 @@
       if (correlation) {
         this.generateSuggestionsFromCorrelation(correlation);
       }
-    } else {
+    } else if (analyses.length === 1) {
       // Single camera - generate suggestions from its analysis
       this.generateSuggestionsFromAnalysis(analyses[0]);
     }
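
With the new guard, a discovery scan can now resolve to null when every per-camera analysis fails. A hypothetical caller would check for that; the method name `runDiscoveryScan` and the public visibility of `status` are assumptions for illustration, not names confirmed by this diff.

```ts
// Illustrative only: `runDiscoveryScan` and the `status` shape are assumptions.
const topology = await discoveryEngine.runDiscoveryScan();
if (!topology) {
  // The analyses.length === 0 path sets lastError to explain the failure.
  console.warn('Discovery produced no topology:', discoveryEngine.status?.lastError);
}
```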