@blueharford/scrypted-spatial-awareness 0.5.3 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blueharford/scrypted-spatial-awareness",
-  "version": "0.5.3",
+  "version": "0.5.4",
   "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
   "author": "Joshua Seidel <blueharford>",
   "license": "Apache-2.0",
@@ -112,10 +112,12 @@ export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
 export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
   if (provider === 'openai') {
     // OpenAI format: uses data URL with image_url wrapper
+    // Include detail parameter for compatibility
     return {
       type: 'image_url',
       image_url: {
         url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+        detail: 'auto',
       },
     };
   } else if (provider === 'anthropic') {
@@ -129,17 +131,31 @@ export function buildImageContent(imageData: ImageData, provider: LlmProvider =
       },
     };
   } else {
-    // Unknown provider: try OpenAI format as it's more commonly supported
-    // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+    // Unknown provider: try Anthropic format first as it's more explicit
+    // Some plugins may translate this to OpenAI format internally
     return {
-      type: 'image_url',
-      image_url: {
-        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      type: 'image',
+      source: {
+        type: 'base64',
+        media_type: imageData.mediaType,
+        data: imageData.base64,
       },
     };
   }
 }
 
+/** Check if an error indicates vision/multimodal content is not supported */
+export function isVisionNotSupportedError(error: any): boolean {
+  const errorStr = String(error);
+  return (
+    errorStr.includes('content.str') ||
+    errorStr.includes('should be a valid string') ||
+    errorStr.includes('Invalid content type') ||
+    errorStr.includes('does not support vision') ||
+    errorStr.includes('image_url') && errorStr.includes('not supported')
+  );
+}
+
 export class SpatialReasoningEngine {
   private config: SpatialReasoningConfig;
   private console: Console;
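For reference, here is a minimal sketch (not part of the package diff itself) of the two content shapes buildImageContent now emits: the OpenAI image_url wrapper, which gains detail: 'auto' in 0.5.4, and the Anthropic base64 source block that unknown providers now receive first.

```ts
// Illustrative only: the two content shapes buildImageContent returns in 0.5.4.
// The ImageData fields (mediaType, base64) follow the package's own type.
const imageData = { mediaType: 'image/jpeg', base64: '/9j/4AAQ...' };

// provider === 'openai': data URL wrapped in image_url, now with detail for compatibility
const openAiContent = {
  type: 'image_url',
  image_url: {
    url: `data:${imageData.mediaType};base64,${imageData.base64}`,
    detail: 'auto',
  },
};

// provider === 'anthropic', and (new in 0.5.4) the 'unknown' fallback
const anthropicContent = {
  type: 'image',
  source: {
    type: 'base64',
    media_type: imageData.mediaType,
    data: imageData.base64,
  },
};
```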
@@ -30,7 +30,7 @@ import {
   Landmark,
   findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider, isVisionNotSupportedError } from './spatial-reasoning';
 
 const { systemManager } = sdk;
 
@@ -253,77 +253,125 @@ export class TopologyDiscoveryEngine {
       return analysis;
     }
 
-    try {
-      // Build multimodal message with provider-specific image format
-      const result = await llm.getChatCompletion({
-        messages: [
-          {
-            role: 'user',
-            content: [
-              { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-              buildImageContent(imageData, this.llmProviderType),
-            ],
-          },
-        ],
-        max_tokens: 500,
-        temperature: 0.3,
-      });
-
-      const content = result?.choices?.[0]?.message?.content;
-      if (content && typeof content === 'string') {
-        try {
-          // Extract JSON from response (handle markdown code blocks)
-          let jsonStr = content.trim();
-          if (jsonStr.startsWith('```')) {
-            jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
-          }
-
-          const parsed = JSON.parse(jsonStr);
-
-          // Map parsed data to our types
-          if (Array.isArray(parsed.landmarks)) {
-            analysis.landmarks = parsed.landmarks.map((l: any) => ({
-              name: l.name || 'Unknown',
-              type: this.mapLandmarkType(l.type),
-              confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
-              description: l.description || '',
-              boundingBox: l.boundingBox,
-            }));
-          }
+    // Try with detected provider format first, then fallback to alternate format
+    const formatsToTry: LlmProvider[] = [this.llmProviderType];
+
+    // Add fallback format
+    if (this.llmProviderType === 'openai') {
+      formatsToTry.push('anthropic');
+    } else if (this.llmProviderType === 'anthropic') {
+      formatsToTry.push('openai');
+    } else {
+      // Unknown - try both
+      formatsToTry.push('openai');
+    }
 
-          if (Array.isArray(parsed.zones)) {
-            analysis.zones = parsed.zones.map((z: any) => ({
-              name: z.name || 'Unknown',
-              type: this.mapZoneType(z.type),
-              coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
-              description: z.description || '',
-              boundingBox: z.boundingBox,
-            }));
+    let lastError: any = null;
+
+    for (const formatType of formatsToTry) {
+      try {
+        this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+
+        // Build multimodal message with provider-specific image format
+        const result = await llm.getChatCompletion({
+          messages: [
+            {
+              role: 'user',
+              content: [
+                { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+                buildImageContent(imageData, formatType),
+              ],
+            },
+          ],
+          max_tokens: 500,
+          temperature: 0.3,
+        });
+
+        const content = result?.choices?.[0]?.message?.content;
+        if (content && typeof content === 'string') {
+          try {
+            // Extract JSON from response (handle markdown code blocks)
+            let jsonStr = content.trim();
+            if (jsonStr.startsWith('```')) {
+              jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+            }
+
+            const parsed = JSON.parse(jsonStr);
+
+            // Map parsed data to our types
+            if (Array.isArray(parsed.landmarks)) {
+              analysis.landmarks = parsed.landmarks.map((l: any) => ({
+                name: l.name || 'Unknown',
+                type: this.mapLandmarkType(l.type),
+                confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+                description: l.description || '',
+                boundingBox: l.boundingBox,
+              }));
+            }
+
+            if (Array.isArray(parsed.zones)) {
+              analysis.zones = parsed.zones.map((z: any) => ({
+                name: z.name || 'Unknown',
+                type: this.mapZoneType(z.type),
+                coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+                description: z.description || '',
+                boundingBox: z.boundingBox,
+              }));
+            }
+
+            if (parsed.edges && typeof parsed.edges === 'object') {
+              analysis.edges = {
+                top: parsed.edges.top || '',
+                left: parsed.edges.left || '',
+                right: parsed.edges.right || '',
+                bottom: parsed.edges.bottom || '',
+              };
+            }
+
+            if (parsed.orientation) {
+              analysis.orientation = this.mapOrientation(parsed.orientation);
+            }
+
+            analysis.isValid = true;
+            this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+
+            // Update the preferred format for future requests
+            if (formatType !== this.llmProviderType) {
+              this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+              this.llmProviderType = formatType;
+            }
+
+            // Success - exit the retry loop
+            return analysis;
+          } catch (parseError) {
+            this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+            analysis.error = 'Failed to parse LLM response';
+            return analysis;
           }
+        }
+      } catch (e) {
+        lastError = e;
 
-          if (parsed.edges && typeof parsed.edges === 'object') {
-            analysis.edges = {
-              top: parsed.edges.top || '',
-              left: parsed.edges.left || '',
-              right: parsed.edges.right || '',
-              bottom: parsed.edges.bottom || '',
-            };
-          }
+        // Check if this is a vision/multimodal format error
+        if (isVisionNotSupportedError(e)) {
+          this.console.warn(`[Discovery] ${formatType} format not supported, trying fallback...`);
+          continue; // Try next format
+        }
 
-          if (parsed.orientation) {
-            analysis.orientation = this.mapOrientation(parsed.orientation);
-          }
+        // Not a format error - don't retry
+        this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+        break;
+      }
+    }
 
-          analysis.isValid = true;
-          this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
-        } catch (parseError) {
-          this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
-          analysis.error = 'Failed to parse LLM response';
-        }
+    // All formats failed
+    if (lastError) {
+      const errorStr = String(lastError);
+      if (isVisionNotSupportedError(lastError)) {
+        analysis.error = 'Vision/image analysis not supported by configured LLM. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured.';
+      } else {
+        analysis.error = `Analysis failed: ${errorStr}`;
       }
-    } catch (e) {
-      this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
-      analysis.error = `Analysis failed: ${e}`;
     }
 
     // Cache the analysis
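Condensed, the retry logic added in this release follows the pattern below. This is a sketch, not the plugin's code: sendSceneAnalysis is a hypothetical stand-in for the getChatCompletion call with the scene prompt and image attached, while LlmProvider and isVisionNotSupportedError are the package's own exports.

```ts
// Sketch of the 0.5.4 format-fallback pattern (illustrative; `sendSceneAnalysis`
// is a hypothetical wrapper around llm.getChatCompletion with prompt + image).
import { LlmProvider, isVisionNotSupportedError } from './spatial-reasoning';

async function analyzeWithFallback(
  preferred: LlmProvider,
  sendSceneAnalysis: (format: LlmProvider) => Promise<string>,
): Promise<string> {
  // Preferred format first; OpenAI falls back to Anthropic, everything else to OpenAI.
  const formatsToTry: LlmProvider[] = [preferred];
  formatsToTry.push(preferred === 'openai' ? 'anthropic' : 'openai');

  let lastError: unknown;
  for (const format of formatsToTry) {
    try {
      // Success: the caller keeps using this format for future requests.
      return await sendSceneAnalysis(format);
    } catch (e) {
      lastError = e;
      if (isVisionNotSupportedError(e)) continue; // wrong image shape: try the next format
      break; // any other error is not retried
    }
  }
  throw lastError;
}
```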