@blueharford/scrypted-spatial-awareness 0.6.9 → 0.6.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blueharford/scrypted-spatial-awareness",
-  "version": "0.6.9",
+  "version": "0.6.11",
   "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
   "author": "Joshua Seidel <blueharford>",
   "license": "Apache-2.0",
@@ -154,51 +154,23 @@ export type LlmProvider = 'openai' | 'anthropic' | 'scrypted' | 'unknown';
 
 /**
  * Build image content block for ChatCompletion API
- * Supports OpenAI, Anthropic, and @scrypted/llm formats
+ *
+ * IMPORTANT: @scrypted/llm uses OpenAI-compatible format for ALL providers.
+ * The plugin internally converts this format to the appropriate provider format.
+ * So we ALWAYS use the OpenAI image_url format with data URI.
+ *
  * @param imageData - Image data with base64 and media type
- * @param provider - The LLM provider type
+ * @param provider - The LLM provider type (currently unused, kept for logging)
  */
 export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
-  if (provider === 'openai') {
-    // OpenAI format: uses data URL with image_url wrapper
-    return {
-      type: 'image_url',
-      image_url: {
-        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
-        detail: 'auto',
-      },
-    };
-  } else if (provider === 'anthropic') {
-    // Anthropic official format: uses 'data' key
-    return {
-      type: 'image',
-      source: {
-        type: 'base64',
-        media_type: imageData.mediaType,
-        data: imageData.base64,
-      },
-    };
-  } else if (provider === 'scrypted') {
-    // @scrypted/llm format: uses 'base64' key (per error path .image.source.base64)
-    return {
-      type: 'image',
-      source: {
-        type: 'base64',
-        media_type: imageData.mediaType,
-        base64: imageData.base64,
-      },
-    };
-  } else {
-    // Unknown provider: try @scrypted/llm format first
-    return {
-      type: 'image',
-      source: {
-        type: 'base64',
-        media_type: imageData.mediaType,
-        base64: imageData.base64,
-      },
-    };
-  }
+  // @scrypted/llm uses OpenAI-compatible format for ALL providers
+  // The plugin handles internal conversion to Anthropic/other formats
+  return {
+    type: 'image_url',
+    image_url: {
+      url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+    },
+  };
 }
 
 /** Check if an error indicates vision/multimodal content format issue (should try alternate format) */
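
For context, a minimal usage sketch of the helper as it stands after this change. The ImageData fields are taken from how the function reads them (imageData.base64 and imageData.mediaType); the sample values are illustrative, not from the package:

    // Sketch, not package code: ImageData shape inferred from this hunk.
    interface ImageData {
      base64: string;    // base64-encoded image bytes
      mediaType: string; // e.g. 'image/jpeg'
    }

    const snapshot: ImageData = { base64: '/9j/4AAQ...', mediaType: 'image/jpeg' };
    const part = buildImageContent(snapshot);
    // part deep-equals:
    // { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,/9j/4AAQ...' } }
    // Note the 'detail: auto' hint from the old OpenAI-only branch is no longer set.
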
@@ -1077,33 +1049,58 @@ Examples of good descriptions:
 
 Generate ONLY the description, nothing else:`;
 
-    // Build message content - use multimodal format if we have an image
-    let messageContent: any;
+    // Try multimodal format first, fall back to text-only if it fails
+    let result: any;
+    let usedVision = false;
+
     if (imageData) {
-      messageContent = [
-        { type: 'text', text: prompt },
-        buildImageContent(imageData, this.llmProviderType),
-      ];
-    } else {
-      messageContent = prompt;
+      // First attempt: Try multimodal with image
+      try {
+        this.console.log(`[LLM] Attempting multimodal ${eventType} call with image...`);
+        const multimodalContent = [
+          { type: 'text', text: prompt },
+          buildImageContent(imageData, this.llmProviderType),
+        ];
+
+        result = await llm.getChatCompletion({
+          messages: [
+            {
+              role: 'user',
+              content: multimodalContent,
+            },
+          ],
+          max_tokens: 100,
+          temperature: 0.7,
+        });
+        usedVision = true;
+      } catch (visionError: any) {
+        // If vision format fails, try text-only
+        if (isVisionFormatError(visionError)) {
+          this.console.warn(`[LLM] Vision format not supported, falling back to text-only: ${visionError.message || visionError}`);
+        } else {
+          this.console.warn(`[LLM] Multimodal call failed, trying text-only: ${visionError.message || visionError}`);
+        }
+      }
     }
 
-    // Call LLM using ChatCompletion interface
-    this.console.log(`[LLM] Calling getChatCompletion for ${eventType}...`);
-    const result = await llm.getChatCompletion({
-      messages: [
-        {
-          role: 'user',
-          content: messageContent,
-        },
-      ],
-      max_tokens: 100,
-      temperature: 0.7,
-    });
+    // If no result yet, try text-only
+    if (!result) {
+      this.console.log(`[LLM] Calling text-only getChatCompletion for ${eventType}...`);
+      result = await llm.getChatCompletion({
+        messages: [
+          {
+            role: 'user',
+            content: prompt,
+          },
+        ],
+        max_tokens: 100,
+        temperature: 0.7,
+      });
+    }
 
     const content = result?.choices?.[0]?.message?.content;
     if (content && typeof content === 'string') {
-      this.console.log(`[LLM] Got ${eventType} description: ${content.trim().substring(0, 50)}...`);
+      this.console.log(`[LLM] Got ${eventType} description (vision=${usedVision}): ${content.trim().substring(0, 50)}...`);
       return content.trim();
     }
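
Note that both branches of the catch differ only in the log message: either way result stays unset, so the text-only retry runs in both cases. The isVisionFormatError helper is referenced here but not shown in this diff; a hypothetical sketch of what such a check might look like, matching on provider error text (the actual patterns used by the plugin are not visible here):

    // Hypothetical sketch only -- the real isVisionFormatError is outside this diff.
    // Goal: recognize "the provider rejected the image content shape" so the
    // caller knows a text-only retry is worth attempting.
    function isVisionFormatError(error: any): boolean {
      const message = String(error?.message ?? error).toLowerCase();
      // Illustrative substrings ('image.source' echoes the error path mentioned
      // in the removed @scrypted/llm branch); real provider error text may differ.
      return ['image_url', 'image.source', 'multimodal', 'vision']
        .some(needle => message.includes(needle));
    }
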