@blueharford/scrypted-spatial-awareness 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blueharford/scrypted-spatial-awareness",
-  "version": "0.5.1",
+  "version": "0.5.3",
   "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
   "author": "Joshua Seidel <blueharford>",
   "license": "Apache-2.0",
@@ -69,29 +69,77 @@ interface ChatCompletionDevice extends ScryptedDevice {
   streamChatCompletion?(params: any): AsyncGenerator<any>;
 }
 
+/** Image data for LLM vision APIs */
+export interface ImageData {
+  /** Raw base64 encoded image data (no data URL prefix) */
+  base64: string;
+  /** MIME type (e.g., 'image/jpeg') */
+  mediaType: string;
+}
+
 /**
- * Convert a MediaObject to a base64 data URL for vision LLM consumption
+ * Convert a MediaObject to base64 image data for vision LLM consumption
  * @param mediaObject - MediaObject from camera.takePicture()
- * @returns Base64 data URL (data:image/jpeg;base64,...) or null if conversion fails
+ * @returns ImageData with raw base64 and media type, or null if conversion fails
  */
-export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<string | null> {
+export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<ImageData | null> {
   try {
     // Convert MediaObject to Buffer using mediaManager
     const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
 
-    // Convert buffer to base64
+    // Convert buffer to base64 (raw, no data URL prefix)
    const base64 = buffer.toString('base64');
 
     // Determine MIME type - default to JPEG for camera images
-    const mimeType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+    const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
 
-    return `data:${mimeType};base64,${base64}`;
+    return { base64, mediaType };
   } catch (e) {
     console.warn('Failed to convert MediaObject to base64:', e);
     return null;
   }
 }
 
+/** LLM Provider type for image format selection */
+export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
+
+/**
+ * Build image content block for ChatCompletion API
+ * Supports both OpenAI and Anthropic formats
+ * @param imageData - Image data with base64 and media type
+ * @param provider - The LLM provider type (openai, anthropic, or unknown)
+ */
+export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
+  if (provider === 'openai') {
+    // OpenAI format: uses data URL with image_url wrapper
+    return {
+      type: 'image_url',
+      image_url: {
+        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      },
+    };
+  } else if (provider === 'anthropic') {
+    // Anthropic format: uses separate base64 data and media_type
+    return {
+      type: 'image',
+      source: {
+        type: 'base64',
+        media_type: imageData.mediaType,
+        data: imageData.base64,
+      },
+    };
+  } else {
+    // Unknown provider: try OpenAI format as it's more commonly supported
+    // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+    return {
+      type: 'image_url',
+      image_url: {
+        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      },
+    };
+  }
+}
+
 export class SpatialReasoningEngine {
   private config: SpatialReasoningConfig;
   private console: Console;
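
The two branches of buildImageContent produce structurally different content blocks for the same snapshot. As a minimal sketch (illustration only, not part of the package source; the sample values below are placeholders):

// Illustration only: sample values are placeholders, not real image data.
const sample: ImageData = { base64: 'iVBORw0KGgo...', mediaType: 'image/png' };

buildImageContent(sample, 'openai');
// => { type: 'image_url',
//      image_url: { url: 'data:image/png;base64,iVBORw0KGgo...' } }

buildImageContent(sample, 'anthropic');
// => { type: 'image',
//      source: { type: 'base64', media_type: 'image/png', data: 'iVBORw0KGgo...' } }
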
@@ -336,6 +384,7 @@ export class SpatialReasoningEngine {
 
   private llmSearched: boolean = false;
   private llmProvider: string | null = null;
+  private llmProviderType: LlmProvider = 'unknown';
 
   /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
   private async findLlmDevice(): Promise<ChatCompletionDevice | null> {
@@ -356,27 +405,36 @@
       const deviceName = device.name?.toLowerCase() || '';
       const pluginId = (device as any).pluginId?.toLowerCase() || '';
 
-      // Identify the provider type for logging
+      // Identify the provider type for logging and image format selection
       let providerType = 'Unknown';
-      if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
-        providerType = 'Scrypted LLM';
-      }
+      let providerTypeEnum: LlmProvider = 'unknown';
+
       if (deviceName.includes('openai') || deviceName.includes('gpt')) {
         providerType = 'OpenAI';
+        providerTypeEnum = 'openai';
       } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
         providerType = 'Anthropic';
+        providerTypeEnum = 'anthropic';
       } else if (deviceName.includes('ollama')) {
         providerType = 'Ollama';
+        providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
       } else if (deviceName.includes('gemini') || deviceName.includes('google')) {
         providerType = 'Google';
+        providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
       } else if (deviceName.includes('llama')) {
         providerType = 'llama.cpp';
+        providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+      } else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+        providerType = 'Scrypted LLM';
+        providerTypeEnum = 'unknown';
       }
 
       this.llmDevice = device as unknown as ChatCompletionDevice;
       this.llmProvider = `${providerType} (${device.name})`;
+      this.llmProviderType = providerTypeEnum;
       this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
       this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+      this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
       this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
       return this.llmDevice;
     }
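
Branch order is load-bearing in the chain above: 'ollama' must be tested before 'llama', since any name containing 'ollama' also contains 'llama' and would otherwise be classified as llama.cpp. The same name-matching rules are duplicated in TopologyDiscoveryEngine.findLlmDevice later in this diff; a shared helper along these lines (hypothetical, not in the package) would keep the two engines in sync:

// Hypothetical refactor sketch: detectLlmProvider() does not exist in the
// package; it only restates the matching rules from the hunk above.
function detectLlmProvider(deviceName: string): LlmProvider {
  const name = deviceName.toLowerCase();
  if (name.includes('openai') || name.includes('gpt')) return 'openai';
  if (name.includes('anthropic') || name.includes('claude')) return 'anthropic';
  // Ollama, Gemini/Google, and llama.cpp expose OpenAI-compatible APIs, so
  // they reuse the OpenAI image format; merging 'ollama' and 'llama' into one
  // clause also removes the ordering hazard.
  if (name.includes('ollama') || name.includes('gemini') ||
      name.includes('google') || name.includes('llama')) return 'openai';
  return 'unknown';
}
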
@@ -398,6 +456,11 @@ export class SpatialReasoningEngine {
     return this.llmProvider;
   }
 
+  /** Get the current LLM provider type for image format selection */
+  getLlmProviderType(): LlmProvider {
+    return this.llmProviderType;
+  }
+
   /** Check if LLM is available */
   isLlmAvailable(): boolean {
     return this.llmDevice !== null;
@@ -751,7 +814,7 @@ export class SpatialReasoningEngine {
 
     try {
       // Convert image to base64 for vision LLM
-      const imageBase64 = await mediaObjectToBase64(mediaObject);
+      const imageData = await mediaObjectToBase64(mediaObject);
 
       // Retrieve relevant context for RAG
       const relevantChunks = this.retrieveRelevantContext(
@@ -775,11 +838,11 @@
 
       // Build message content - use multimodal format if we have an image
       let messageContent: any;
-      if (imageBase64) {
-        // Vision-capable multimodal message format (OpenAI compatible)
+      if (imageData) {
+        // Vision-capable multimodal message format (provider-specific)
         messageContent = [
           { type: 'text', text: prompt },
-          { type: 'image_url', image_url: { url: imageBase64 } },
+          buildImageContent(imageData, this.llmProviderType),
         ];
       } else {
         // Fallback to text-only if image conversion failed
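
When conversion succeeds, the engine now sends a two-element content array whose image block matches the detected provider; when it fails, it degrades to a text-only prompt instead of aborting. A sketch of the assembled payload for an 'openai'-type provider (prompt text and base64 abbreviated; values illustrative):

// Illustration of messageContent when imageData is non-null and
// llmProviderType is 'openai'; both strings are placeholders.
const messageContent = [
  { type: 'text', text: 'Analyze this security camera image...' },
  {
    type: 'image_url',
    image_url: { url: 'data:image/jpeg;base64,/9j/4AAQSkZJRg...' },
  },
];
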
@@ -863,7 +926,7 @@ Generate ONLY the description, nothing else:`;
 
     try {
       // Convert image to base64 for vision LLM
-      const imageBase64 = await mediaObjectToBase64(mediaObject);
+      const imageData = await mediaObjectToBase64(mediaObject);
 
       const prompt = `Analyze this security camera image. A ${objectClass} was detected.
 
@@ -880,11 +943,11 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
 
       // Build message content - use multimodal format if we have an image
       let messageContent: any;
-      if (imageBase64) {
-        // Vision-capable multimodal message format (OpenAI compatible)
+      if (imageData) {
+        // Vision-capable multimodal message format (provider-specific)
         messageContent = [
           { type: 'text', text: prompt },
-          { type: 'image_url', image_url: { url: imageBase64 } },
+          buildImageContent(imageData, this.llmProviderType),
         ];
       } else {
         // Fallback to text-only if image conversion failed
@@ -30,7 +30,7 @@ import {
   Landmark,
   findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64 } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
 
 const { systemManager } = sdk;
 
@@ -100,6 +100,7 @@ export class TopologyDiscoveryEngine {
   private topology: CameraTopology | null = null;
   private llmDevice: ChatCompletionDevice | null = null;
   private llmSearched: boolean = false;
+  private llmProviderType: LlmProvider = 'unknown';
 
   // Scene analysis cache (camera ID -> analysis)
   private sceneCache: Map<string, SceneAnalysis> = new Map();
@@ -177,8 +178,24 @@ export class TopologyDiscoveryEngine {
       if (!device) continue;
 
       if (device.interfaces?.includes('ChatCompletion')) {
+        const deviceName = device.name?.toLowerCase() || '';
+
+        // Detect provider type for image format selection
+        if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+          this.llmProviderType = 'openai';
+        } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+          this.llmProviderType = 'anthropic';
+        } else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+                   deviceName.includes('google') || deviceName.includes('llama')) {
+          // These providers use OpenAI-compatible format
+          this.llmProviderType = 'openai';
+        } else {
+          this.llmProviderType = 'unknown';
+        }
+
         this.llmDevice = device as unknown as ChatCompletionDevice;
         this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+        this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
         return this.llmDevice;
       }
     }
@@ -191,8 +208,8 @@ export class TopologyDiscoveryEngine {
     return null;
   }
 
-  /** Get camera snapshot as base64 */
-  private async getCameraSnapshot(cameraId: string): Promise<string | null> {
+  /** Get camera snapshot as ImageData */
+  private async getCameraSnapshot(cameraId: string): Promise<ImageData | null> {
     try {
       const camera = systemManager.getDeviceById<Camera>(cameraId);
       if (!camera?.interfaces?.includes(ScryptedInterface.Camera)) {
@@ -230,21 +247,21 @@ export class TopologyDiscoveryEngine {
       return analysis;
     }
 
-    const imageBase64 = await this.getCameraSnapshot(cameraId);
-    if (!imageBase64) {
+    const imageData = await this.getCameraSnapshot(cameraId);
+    if (!imageData) {
       analysis.error = 'Failed to capture camera snapshot';
       return analysis;
     }
 
     try {
-      // Build multimodal message
+      // Build multimodal message with provider-specific image format
       const result = await llm.getChatCompletion({
         messages: [
           {
             role: 'user',
             content: [
               { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-              { type: 'image_url', image_url: { url: imageBase64 } },
+              buildImageContent(imageData, this.llmProviderType),
             ],
           },
         ],
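
For comparison, the same call against an Anthropic-type provider carries the image as a raw base64 block rather than a data URL. A sketch of the request (SCENE_ANALYSIS_PROMPT is defined elsewhere in the plugin; the base64 payload is abbreviated):

// Illustrative request shape when this.llmProviderType === 'anthropic'.
await llm.getChatCompletion({
  messages: [
    {
      role: 'user',
      content: [
        { type: 'text', text: SCENE_ANALYSIS_PROMPT },
        {
          type: 'image',
          source: {
            type: 'base64',
            media_type: 'image/jpeg',
            data: '/9j/4AAQSkZJRg...', // placeholder
          },
        },
      ],
    },
  ],
});
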
@@ -387,6 +404,14 @@ export class TopologyDiscoveryEngine {
     this.status.camerasAnalyzed = analyses.length;
     this.console.log(`[Discovery] Analyzed ${analyses.length} cameras successfully`);
 
+    // Handle case where no cameras were successfully analyzed
+    if (analyses.length === 0) {
+      this.console.warn('[Discovery] No cameras were successfully analyzed');
+      this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
+      this.status.lastScanTime = Date.now();
+      return null;
+    }
+
     // Correlate if we have multiple cameras
     let correlation: TopologyCorrelation | null = null;
     if (analyses.length >= 2) {
@@ -394,7 +419,7 @@ export class TopologyDiscoveryEngine {
       if (correlation) {
         this.generateSuggestionsFromCorrelation(correlation);
       }
-    } else {
+    } else if (analyses.length === 1) {
       // Single camera - generate suggestions from its analysis
       this.generateSuggestionsFromAnalysis(analyses[0]);
     }