npm - @blueharford/scrypted-spatial-awareness - Versions diffs - 0.5.2 → 0.5.4 - Mend

@blueharford/scrypted-spatial-awareness 0.5.2 → 0.5.4

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (9)

package/dist/main.nodejs.js +1 -1
package/dist/main.nodejs.js.map +1 -1
package/dist/plugin.zip +0 -0
package/out/main.nodejs.js +190 -80
package/out/main.nodejs.js.map +1 -1
package/out/plugin.zip +0 -0
package/package.json +1 -1
package/src/core/spatial-reasoning.ts +74 -20
package/src/core/topology-discovery.ts +131 -66

package/dist/plugin.zip CHANGED Viewed

Binary file

package/out/main.nodejs.js CHANGED Viewed

@@ -35095,6 +35095,7 @@ Object.defineProperty(exports, "__esModule", ({ value: true }));
 exports.SpatialReasoningEngine = void 0;
 exports.mediaObjectToBase64 = mediaObjectToBase64;
 exports.buildImageContent = buildImageContent;
+exports.isVisionNotSupportedError = isVisionNotSupportedError;
 const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
 const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
 const { systemManager, mediaManager } = sdk_1.default;
@@ -35120,19 +35121,54 @@ async function mediaObjectToBase64(mediaObject) {
 }
 /**
  * Build image content block for ChatCompletion API
- * Compatible with both OpenAI and Anthropic formats via @scrypted/llm
+ * Supports both OpenAI and Anthropic formats
+ * @param imageData - Image data with base64 and media type
+ * @param provider - The LLM provider type (openai, anthropic, or unknown)
  */
-function buildImageContent(imageData) {
-    // Use Anthropic's native format which @scrypted/llm should translate
-    // This format is more explicit about the base64 data
-    return {
-        type: 'image',
-        source: {
-            type: 'base64',
-            media_type: imageData.mediaType,
-            data: imageData.base64,
-        },
-    };
+function buildImageContent(imageData, provider = 'unknown') {
+    if (provider === 'openai') {
+        // OpenAI format: uses data URL with image_url wrapper
+        // Include detail parameter for compatibility
+        return {
+            type: 'image_url',
+            image_url: {
+                url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+                detail: 'auto',
+            },
+        };
+    }
+    else if (provider === 'anthropic') {
+        // Anthropic format: uses separate base64 data and media_type
+        return {
+            type: 'image',
+            source: {
+                type: 'base64',
+                media_type: imageData.mediaType,
+                data: imageData.base64,
+            },
+        };
+    }
+    else {
+        // Unknown provider: try Anthropic format first as it's more explicit
+        // Some plugins may translate this to OpenAI format internally
+        return {
+            type: 'image',
+            source: {
+                type: 'base64',
+                media_type: imageData.mediaType,
+                data: imageData.base64,
+            },
+        };
+    }
+}
+/** Check if an error indicates vision/multimodal content is not supported */
+function isVisionNotSupportedError(error) {
+    const errorStr = String(error);
+    return (errorStr.includes('content.str') ||
+        errorStr.includes('should be a valid string') ||
+        errorStr.includes('Invalid content type') ||
+        errorStr.includes('does not support vision') ||
+        errorStr.includes('image_url') && errorStr.includes('not supported'));
 }
 class SpatialReasoningEngine {
     config;
@@ -35353,6 +35389,7 @@ class SpatialReasoningEngine {
     }
     llmSearched = false;
     llmProvider = null;
+    llmProviderType = 'unknown';
     /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
     async findLlmDevice() {
         if (this.llmDevice)
@@ -35371,30 +35408,39 @@ class SpatialReasoningEngine {
                 if (device.interfaces?.includes('ChatCompletion')) {
                     const deviceName = device.name?.toLowerCase() || '';
                     const pluginId = device.pluginId?.toLowerCase() || '';
-                    // Identify the provider type for logging
+                    // Identify the provider type for logging and image format selection
                     let providerType = 'Unknown';
-                    if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
-                        providerType = 'Scrypted LLM';
-                    }
+                    let providerTypeEnum = 'unknown';
                     if (deviceName.includes('openai') || deviceName.includes('gpt')) {
                         providerType = 'OpenAI';
+                        providerTypeEnum = 'openai';
                     }
                     else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
                         providerType = 'Anthropic';
+                        providerTypeEnum = 'anthropic';
                     }
                     else if (deviceName.includes('ollama')) {
                         providerType = 'Ollama';
+                        providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
                     }
                     else if (deviceName.includes('gemini') || deviceName.includes('google')) {
                         providerType = 'Google';
+                        providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
                     }
                     else if (deviceName.includes('llama')) {
                         providerType = 'llama.cpp';
+                        providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+                    }
+                    else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+                        providerType = 'Scrypted LLM';
+                        providerTypeEnum = 'unknown';
                     }
                     this.llmDevice = device;
                     this.llmProvider = `${providerType} (${device.name})`;
+                    this.llmProviderType = providerTypeEnum;
                     this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
                     this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+                    this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
                     this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
                     return this.llmDevice;
                 }
@@ -35412,6 +35458,10 @@ class SpatialReasoningEngine {
     getLlmProvider() {
         return this.llmProvider;
     }
+    /** Get the current LLM provider type for image format selection */
+    getLlmProviderType() {
+        return this.llmProviderType;
+    }
     /** Check if LLM is available */
     isLlmAvailable() {
         return this.llmDevice !== null;
@@ -35688,10 +35738,10 @@ class SpatialReasoningEngine {
             // Build message content - use multimodal format if we have an image
             let messageContent;
             if (imageData) {
-                // Vision-capable multimodal message format (Anthropic native format)
+                // Vision-capable multimodal message format (provider-specific)
                 messageContent = [
                     { type: 'text', text: prompt },
-                    buildImageContent(imageData),
+                    buildImageContent(imageData, this.llmProviderType),
                 ];
             }
             else {
@@ -35774,10 +35824,10 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
             // Build message content - use multimodal format if we have an image
             let messageContent;
             if (imageData) {
-                // Vision-capable multimodal message format (Anthropic native format)
+                // Vision-capable multimodal message format (provider-specific)
                 messageContent = [
                     { type: 'text', text: prompt },
-                    buildImageContent(imageData),
+                    buildImageContent(imageData, this.llmProviderType),
                 ];
             }
             else {
@@ -36017,6 +36067,7 @@ class TopologyDiscoveryEngine {
     topology = null;
     llmDevice = null;
     llmSearched = false;
+    llmProviderType = 'unknown';
     // Scene analysis cache (camera ID -> analysis)
     sceneCache = new Map();
     // Pending suggestions for user review
@@ -36080,8 +36131,25 @@ class TopologyDiscoveryEngine {
                 if (!device)
                     continue;
                 if (device.interfaces?.includes('ChatCompletion')) {
+                    const deviceName = device.name?.toLowerCase() || '';
+                    // Detect provider type for image format selection
+                    if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+                        this.llmProviderType = 'openai';
+                    }
+                    else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+                        this.llmProviderType = 'anthropic';
+                    }
+                    else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+                        deviceName.includes('google') || deviceName.includes('llama')) {
+                        // These providers use OpenAI-compatible format
+                        this.llmProviderType = 'openai';
+                    }
+                    else {
+                        this.llmProviderType = 'unknown';
+                    }
                     this.llmDevice = device;
                     this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+                    this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
                     return this.llmDevice;
                 }
             }
@@ -36132,72 +36200,114 @@ class TopologyDiscoveryEngine {
             analysis.error = 'Failed to capture camera snapshot';
             return analysis;
         }
-        try {
-            // Build multimodal message with Anthropic-native format
-            const result = await llm.getChatCompletion({
-                messages: [
-                    {
-                        role: 'user',
-                        content: [
-                            { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-                            (0, spatial_reasoning_1.buildImageContent)(imageData),
-                        ],
-                    },
-                ],
-                max_tokens: 500,
-                temperature: 0.3,
-            });
-            const content = result?.choices?.[0]?.message?.content;
-            if (content && typeof content === 'string') {
-                try {
-                    // Extract JSON from response (handle markdown code blocks)
-                    let jsonStr = content.trim();
-                    if (jsonStr.startsWith('```')) {
-                        jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
-                    }
-                    const parsed = JSON.parse(jsonStr);
-                    // Map parsed data to our types
-                    if (Array.isArray(parsed.landmarks)) {
-                        analysis.landmarks = parsed.landmarks.map((l) => ({
-                            name: l.name || 'Unknown',
-                            type: this.mapLandmarkType(l.type),
-                            confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
-                            description: l.description || '',
-                            boundingBox: l.boundingBox,
-                        }));
-                    }
-                    if (Array.isArray(parsed.zones)) {
-                        analysis.zones = parsed.zones.map((z) => ({
-                            name: z.name || 'Unknown',
-                            type: this.mapZoneType(z.type),
-                            coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
-                            description: z.description || '',
-                            boundingBox: z.boundingBox,
-                        }));
-                    }
-                    if (parsed.edges && typeof parsed.edges === 'object') {
-                        analysis.edges = {
-                            top: parsed.edges.top || '',
-                            left: parsed.edges.left || '',
-                            right: parsed.edges.right || '',
-                            bottom: parsed.edges.bottom || '',
-                        };
+        // Try with detected provider format first, then fallback to alternate format
+        const formatsToTry = [this.llmProviderType];
+        // Add fallback format
+        if (this.llmProviderType === 'openai') {
+            formatsToTry.push('anthropic');
+        }
+        else if (this.llmProviderType === 'anthropic') {
+            formatsToTry.push('openai');
+        }
+        else {
+            // Unknown - try both
+            formatsToTry.push('openai');
+        }
+        let lastError = null;
+        for (const formatType of formatsToTry) {
+            try {
+                this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+                // Build multimodal message with provider-specific image format
+                const result = await llm.getChatCompletion({
+                    messages: [
+                        {
+                            role: 'user',
+                            content: [
+                                { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+                                (0, spatial_reasoning_1.buildImageContent)(imageData, formatType),
+                            ],
+                        },
+                    ],
+                    max_tokens: 500,
+                    temperature: 0.3,
+                });
+                const content = result?.choices?.[0]?.message?.content;
+                if (content && typeof content === 'string') {
+                    try {
+                        // Extract JSON from response (handle markdown code blocks)
+                        let jsonStr = content.trim();
+                        if (jsonStr.startsWith('```')) {
+                            jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+                        }
+                        const parsed = JSON.parse(jsonStr);
+                        // Map parsed data to our types
+                        if (Array.isArray(parsed.landmarks)) {
+                            analysis.landmarks = parsed.landmarks.map((l) => ({
+                                name: l.name || 'Unknown',
+                                type: this.mapLandmarkType(l.type),
+                                confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+                                description: l.description || '',
+                                boundingBox: l.boundingBox,
+                            }));
+                        }
+                        if (Array.isArray(parsed.zones)) {
+                            analysis.zones = parsed.zones.map((z) => ({
+                                name: z.name || 'Unknown',
+                                type: this.mapZoneType(z.type),
+                                coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+                                description: z.description || '',
+                                boundingBox: z.boundingBox,
+                            }));
+                        }
+                        if (parsed.edges && typeof parsed.edges === 'object') {
+                            analysis.edges = {
+                                top: parsed.edges.top || '',
+                                left: parsed.edges.left || '',
+                                right: parsed.edges.right || '',
+                                bottom: parsed.edges.bottom || '',
+                            };
+                        }
+                        if (parsed.orientation) {
+                            analysis.orientation = this.mapOrientation(parsed.orientation);
+                        }
+                        analysis.isValid = true;
+                        this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+                        // Update the preferred format for future requests
+                        if (formatType !== this.llmProviderType) {
+                            this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+                            this.llmProviderType = formatType;
+                        }
+                        // Success - exit the retry loop
+                        return analysis;
                     }
-                    if (parsed.orientation) {
-                        analysis.orientation = this.mapOrientation(parsed.orientation);
+                    catch (parseError) {
+                        this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+                        analysis.error = 'Failed to parse LLM response';
+                        return analysis;
                     }
-                    analysis.isValid = true;
-                    this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
                 }
-                catch (parseError) {
-                    this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
-                    analysis.error = 'Failed to parse LLM response';
+            }
+            catch (e) {
+                lastError = e;
+                // Check if this is a vision/multimodal format error
+                if ((0, spatial_reasoning_1.isVisionNotSupportedError)(e)) {
+                    this.console.warn(`[Discovery] ${formatType} format not supported, trying fallback...`);
+                    continue; // Try next format
                 }
+                // Not a format error - don't retry
+                this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+                break;
             }
         }
-        catch (e) {
-            this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
-            analysis.error = `Analysis failed: ${e}`;
+        // All formats failed
+        if (lastError) {
+            const errorStr = String(lastError);
+            if ((0, spatial_reasoning_1.isVisionNotSupportedError)(lastError)) {
+                analysis.error = 'Vision/image analysis not supported by configured LLM. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured.';
+            }
+            else {
+                analysis.error = `Analysis failed: ${errorStr}`;
+            }
         }
         // Cache the analysis
         this.sceneCache.set(cameraId, analysis);