@blueharford/scrypted-spatial-awareness 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/plugin.zip CHANGED
Binary file
@@ -35094,29 +35094,46 @@ var __importStar = (this && this.__importStar) || (function () {
  Object.defineProperty(exports, "__esModule", ({ value: true }));
  exports.SpatialReasoningEngine = void 0;
  exports.mediaObjectToBase64 = mediaObjectToBase64;
+ exports.buildImageContent = buildImageContent;
  const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
  const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
  const { systemManager, mediaManager } = sdk_1.default;
  /**
- * Convert a MediaObject to a base64 data URL for vision LLM consumption
+ * Convert a MediaObject to base64 image data for vision LLM consumption
  * @param mediaObject - MediaObject from camera.takePicture()
- * @returns Base64 data URL (data:image/jpeg;base64,...) or null if conversion fails
+ * @returns ImageData with raw base64 and media type, or null if conversion fails
  */
  async function mediaObjectToBase64(mediaObject) {
  try {
  // Convert MediaObject to Buffer using mediaManager
  const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, sdk_1.ScryptedMimeTypes.Image);
- // Convert buffer to base64
+ // Convert buffer to base64 (raw, no data URL prefix)
  const base64 = buffer.toString('base64');
  // Determine MIME type - default to JPEG for camera images
- const mimeType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
- return `data:${mimeType};base64,${base64}`;
+ const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+ return { base64, mediaType };
  }
  catch (e) {
  console.warn('Failed to convert MediaObject to base64:', e);
  return null;
  }
  }
+ /**
+ * Build image content block for ChatCompletion API
+ * Compatible with both OpenAI and Anthropic formats via @scrypted/llm
+ */
+ function buildImageContent(imageData) {
+ // Use Anthropic's native format which @scrypted/llm should translate
+ // This format is more explicit about the base64 data
+ return {
+ type: 'image',
+ source: {
+ type: 'base64',
+ media_type: imageData.mediaType,
+ data: imageData.base64,
+ },
+ };
+ }
  class SpatialReasoningEngine {
  config;
  console;
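
Net effect of the hunk above: mediaObjectToBase64 now returns a raw base64 payload plus its media type instead of a prefixed data URL, and the new buildImageContent helper wraps that pair in an Anthropic Messages-style image block. A minimal TypeScript sketch of the shapes involved (the ImageData name comes from the JSDoc; the plugin's actual type declarations are not part of this diff):

// Return shape of mediaObjectToBase64 after this change.
interface ImageData {
    base64: string;    // raw payload, no "data:...;base64," prefix
    mediaType: string; // e.g. "image/jpeg"
}

// Before: OpenAI-style block, built inline from a data URL.
type OpenAIImageBlock = { type: 'image_url'; image_url: { url: string } };

// After: Anthropic-style block, as produced by buildImageContent.
type AnthropicImageBlock = {
    type: 'image';
    source: { type: 'base64'; media_type: string; data: string };
};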
@@ -35661,7 +35678,7 @@ class SpatialReasoningEngine {
  return null;
  try {
  // Convert image to base64 for vision LLM
- const imageBase64 = await mediaObjectToBase64(mediaObject);
+ const imageData = await mediaObjectToBase64(mediaObject);
  // Retrieve relevant context for RAG
  const relevantChunks = this.retrieveRelevantContext(fromCamera.deviceId, toCamera.deviceId);
  // Build RAG context
@@ -35670,11 +35687,11 @@ class SpatialReasoningEngine {
  const prompt = this.buildLlmPrompt(tracked, fromCamera, toCamera, transitTime, fromLandmarks, toLandmarks, ragContext);
  // Build message content - use multimodal format if we have an image
  let messageContent;
- if (imageBase64) {
- // Vision-capable multimodal message format (OpenAI compatible)
+ if (imageData) {
+ // Vision-capable multimodal message format (Anthropic native format)
  messageContent = [
  { type: 'text', text: prompt },
- { type: 'image_url', image_url: { url: imageBase64 } },
+ buildImageContent(imageData),
  ];
  }
  else {
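
The same swap at this call site, shown as the content array before and after (base64 value illustrative):

// Before: data URL embedded in an OpenAI-style image_url block.
const before = [
    { type: 'text', text: prompt },
    { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,/9j/4AAQ...' } },
];

// After: the shared helper emits the Anthropic-style block.
const after = [
    { type: 'text', text: prompt },
    buildImageContent(imageData), // { type: 'image', source: { type: 'base64', ... } }
];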
@@ -35741,7 +35758,7 @@ Generate ONLY the description, nothing else:`;
  return null;
  try {
  // Convert image to base64 for vision LLM
- const imageBase64 = await mediaObjectToBase64(mediaObject);
+ const imageData = await mediaObjectToBase64(mediaObject);
  const prompt = `Analyze this security camera image. A ${objectClass} was detected.

  Looking at the surroundings and environment, identify any notable landmarks or features visible that could help describe this location. Consider:
@@ -35756,11 +35773,11 @@ If you can identify a clear landmark feature, respond with ONLY a JSON object:
  If no clear landmark is identifiable, respond with: {"name": null}`;
  // Build message content - use multimodal format if we have an image
  let messageContent;
- if (imageBase64) {
- // Vision-capable multimodal message format (OpenAI compatible)
+ if (imageData) {
+ // Vision-capable multimodal message format (Anthropic native format)
  messageContent = [
  { type: 'text', text: prompt },
- { type: 'image_url', image_url: { url: imageBase64 } },
+ buildImageContent(imageData),
  ];
  }
  else {
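
The landmark prompt above pins the model to a JSON-only reply: either {"name": "..."} or {"name": null}. The code that consumes the completion is outside this diff; a hypothetical parser honoring that contract might look like:

// Hypothetical consumer of the JSON-only landmark contract; the plugin's
// real response handling is not shown in this diff.
function parseLandmarkReply(reply: string): string | null {
    try {
        const parsed = JSON.parse(reply.trim()) as { name: string | null };
        return typeof parsed.name === 'string' ? parsed.name : null;
    } catch {
        return null; // model ignored the JSON-only instruction
    }
}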
@@ -36075,7 +36092,7 @@ class TopologyDiscoveryEngine {
  }
  return null;
  }
- /** Get camera snapshot as base64 */
+ /** Get camera snapshot as ImageData */
  async getCameraSnapshot(cameraId) {
  try {
  const camera = systemManager.getDeviceById(cameraId);
@@ -36110,20 +36127,20 @@ class TopologyDiscoveryEngine {
  analysis.error = 'No LLM device available';
  return analysis;
  }
- const imageBase64 = await this.getCameraSnapshot(cameraId);
- if (!imageBase64) {
+ const imageData = await this.getCameraSnapshot(cameraId);
+ if (!imageData) {
  analysis.error = 'Failed to capture camera snapshot';
  return analysis;
  }
  try {
- // Build multimodal message
+ // Build multimodal message with Anthropic-native format
  const result = await llm.getChatCompletion({
  messages: [
  {
  role: 'user',
  content: [
  { type: 'text', text: SCENE_ANALYSIS_PROMPT },
- { type: 'image_url', image_url: { url: imageBase64 } },
+ (0, spatial_reasoning_1.buildImageContent)(imageData),
  ],
  },
  ],
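
A note on the call syntax in the added line: (0, spatial_reasoning_1.buildImageContent)(imageData) is the TypeScript compiler's standard emit for invoking an imported function. The comma expression evaluates to the bare function, so the call proceeds without the namespace object bound as `this`. A tiny self-contained demonstration:

// The comma operator detaches a method from its object before the call.
'use strict';
const obj = {
    whoAmI(this: unknown) { return this; },
};
obj.whoAmI();      // returns obj
(0, obj.whoAmI)(); // returns undefined in strict mode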
@@ -36258,6 +36275,13 @@ class TopologyDiscoveryEngine {
  }
  this.status.camerasAnalyzed = analyses.length;
  this.console.log(`[Discovery] Analyzed ${analyses.length} cameras successfully`);
+ // Handle case where no cameras were successfully analyzed
+ if (analyses.length === 0) {
+ this.console.warn('[Discovery] No cameras were successfully analyzed');
+ this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
+ this.status.lastScanTime = Date.now();
+ return null;
+ }
  // Correlate if we have multiple cameras
  let correlation = null;
  if (analyses.length >= 2) {
@@ -36266,7 +36290,7 @@ class TopologyDiscoveryEngine {
  this.generateSuggestionsFromCorrelation(correlation);
  }
  }
- else {
+ else if (analyses.length === 1) {
  // Single camera - generate suggestions from its analysis
  this.generateSuggestionsFromAnalysis(analyses[0]);
  }
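
Taken together, the last two hunks close a gap: when every per-camera analysis failed, analyses.length was 0, the >= 2 correlation test failed, and the old else branch called generateSuggestionsFromAnalysis(analyses[0]) on an undefined element. A paraphrase of the control flow after the change (condensed; not the literal source, which interleaves other bookkeeping):

// Paraphrased post-analysis branching (logging elided).
if (analyses.length === 0) {
    // new guard: fail fast with a recorded error instead of touching analyses[0]
    this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
    this.status.lastScanTime = Date.now();
    return null;
}
if (analyses.length >= 2) {
    // multi-camera: correlate views, then generateSuggestionsFromCorrelation(...)
} else if (analyses.length === 1) {
    this.generateSuggestionsFromAnalysis(analyses[0]);
}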