@blueharford/scrypted-spatial-awareness 0.5.3 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/plugin.zip CHANGED
Binary file
@@ -35095,6 +35095,7 @@ Object.defineProperty(exports, "__esModule", ({ value: true }));
  exports.SpatialReasoningEngine = void 0;
  exports.mediaObjectToBase64 = mediaObjectToBase64;
  exports.buildImageContent = buildImageContent;
+ exports.isVisionNotSupportedError = isVisionNotSupportedError;
  const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
  const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
  const { systemManager, mediaManager } = sdk_1.default;
@@ -35127,10 +35128,12 @@ async function mediaObjectToBase64(mediaObject) {
  function buildImageContent(imageData, provider = 'unknown') {
      if (provider === 'openai') {
          // OpenAI format: uses data URL with image_url wrapper
+         // Include detail parameter for compatibility
          return {
              type: 'image_url',
              image_url: {
                  url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+                 detail: 'auto',
              },
          };
      }
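For reference, the OpenAI-format part that buildImageContent emits after this change looks like the sketch below; the base64 payload is a truncated placeholder, not real data, and 'auto' is one of OpenAI's accepted detail values ('low' | 'high' | 'auto'):

    // Illustrative output of buildImageContent(imageData, 'openai')
    const openAiImagePart = {
        type: 'image_url',
        image_url: {
            url: 'data:image/jpeg;base64,/9j/4AAQ...', // data URL built from mediaType + base64
            detail: 'auto', // lets the API choose the resolution tier
        },
    };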
@@ -35146,16 +35149,27 @@ function buildImageContent(imageData, provider = 'unknown') {
          };
      }
      else {
-         // Unknown provider: try OpenAI format as it's more commonly supported
-         // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+         // Unknown provider: try Anthropic format first as it's more explicit
+         // Some plugins may translate this to OpenAI format internally
          return {
-             type: 'image_url',
-             image_url: {
-                 url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+             type: 'image',
+             source: {
+                 type: 'base64',
+                 media_type: imageData.mediaType,
+                 data: imageData.base64,
              },
          };
      }
  }
+ /** Check if an error indicates vision/multimodal content is not supported */
+ function isVisionNotSupportedError(error) {
+     const errorStr = String(error);
+     return (errorStr.includes('content.str') ||
+         errorStr.includes('should be a valid string') ||
+         errorStr.includes('Invalid content type') ||
+         errorStr.includes('does not support vision') ||
+         errorStr.includes('image_url') && errorStr.includes('not supported'));
+ }
  class SpatialReasoningEngine {
      config;
      console;
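For comparison, a sketch of the Anthropic-format part that unknown providers now receive, plus a few examples of the substring matching the new isVisionNotSupportedError helper performs (all values illustrative):

    // Illustrative output of buildImageContent(imageData, 'anthropic'),
    // which is now also the default for unknown providers
    const anthropicImagePart = {
        type: 'image',
        source: {
            type: 'base64',
            media_type: 'image/jpeg', // from imageData.mediaType
            data: '/9j/4AAQ...',      // truncated placeholder from imageData.base64
        },
    };

    // The helper matches substrings of the stringified error:
    isVisionNotSupportedError(new Error('Invalid content type'));          // true
    isVisionNotSupportedError(new Error('model does not support vision')); // true
    isVisionNotSupportedError(new Error('rate limit exceeded'));           // false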
@@ -36186,72 +36200,114 @@ class TopologyDiscoveryEngine {
              analysis.error = 'Failed to capture camera snapshot';
              return analysis;
          }
-         try {
-             // Build multimodal message with provider-specific image format
-             const result = await llm.getChatCompletion({
-                 messages: [
-                     {
-                         role: 'user',
-                         content: [
-                             { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-                             (0, spatial_reasoning_1.buildImageContent)(imageData, this.llmProviderType),
-                         ],
-                     },
-                 ],
-                 max_tokens: 500,
-                 temperature: 0.3,
-             });
-             const content = result?.choices?.[0]?.message?.content;
-             if (content && typeof content === 'string') {
-                 try {
-                     // Extract JSON from response (handle markdown code blocks)
-                     let jsonStr = content.trim();
-                     if (jsonStr.startsWith('```')) {
-                         jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
-                     }
-                     const parsed = JSON.parse(jsonStr);
-                     // Map parsed data to our types
-                     if (Array.isArray(parsed.landmarks)) {
-                         analysis.landmarks = parsed.landmarks.map((l) => ({
-                             name: l.name || 'Unknown',
-                             type: this.mapLandmarkType(l.type),
-                             confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
-                             description: l.description || '',
-                             boundingBox: l.boundingBox,
-                         }));
-                     }
-                     if (Array.isArray(parsed.zones)) {
-                         analysis.zones = parsed.zones.map((z) => ({
-                             name: z.name || 'Unknown',
-                             type: this.mapZoneType(z.type),
-                             coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
-                             description: z.description || '',
-                             boundingBox: z.boundingBox,
-                         }));
-                     }
-                     if (parsed.edges && typeof parsed.edges === 'object') {
-                         analysis.edges = {
-                             top: parsed.edges.top || '',
-                             left: parsed.edges.left || '',
-                             right: parsed.edges.right || '',
-                             bottom: parsed.edges.bottom || '',
-                         };
+         // Try with detected provider format first, then fallback to alternate format
+         const formatsToTry = [this.llmProviderType];
+         // Add fallback format
+         if (this.llmProviderType === 'openai') {
+             formatsToTry.push('anthropic');
+         }
+         else if (this.llmProviderType === 'anthropic') {
+             formatsToTry.push('openai');
+         }
+         else {
+             // Unknown - try both
+             formatsToTry.push('openai');
+         }
+         let lastError = null;
+         for (const formatType of formatsToTry) {
+             try {
+                 this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+                 // Build multimodal message with provider-specific image format
+                 const result = await llm.getChatCompletion({
+                     messages: [
+                         {
+                             role: 'user',
+                             content: [
+                                 { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+                                 (0, spatial_reasoning_1.buildImageContent)(imageData, formatType),
+                             ],
+                         },
+                     ],
+                     max_tokens: 500,
+                     temperature: 0.3,
+                 });
+                 const content = result?.choices?.[0]?.message?.content;
+                 if (content && typeof content === 'string') {
+                     try {
+                         // Extract JSON from response (handle markdown code blocks)
+                         let jsonStr = content.trim();
+                         if (jsonStr.startsWith('```')) {
+                             jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+                         }
+                         const parsed = JSON.parse(jsonStr);
+                         // Map parsed data to our types
+                         if (Array.isArray(parsed.landmarks)) {
+                             analysis.landmarks = parsed.landmarks.map((l) => ({
+                                 name: l.name || 'Unknown',
+                                 type: this.mapLandmarkType(l.type),
+                                 confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+                                 description: l.description || '',
+                                 boundingBox: l.boundingBox,
+                             }));
+                         }
+                         if (Array.isArray(parsed.zones)) {
+                             analysis.zones = parsed.zones.map((z) => ({
+                                 name: z.name || 'Unknown',
+                                 type: this.mapZoneType(z.type),
+                                 coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+                                 description: z.description || '',
+                                 boundingBox: z.boundingBox,
+                             }));
+                         }
+                         if (parsed.edges && typeof parsed.edges === 'object') {
+                             analysis.edges = {
+                                 top: parsed.edges.top || '',
+                                 left: parsed.edges.left || '',
+                                 right: parsed.edges.right || '',
+                                 bottom: parsed.edges.bottom || '',
+                             };
+                         }
+                         if (parsed.orientation) {
+                             analysis.orientation = this.mapOrientation(parsed.orientation);
+                         }
+                         analysis.isValid = true;
+                         this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+                         // Update the preferred format for future requests
+                         if (formatType !== this.llmProviderType) {
+                             this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+                             this.llmProviderType = formatType;
+                         }
+                         // Success - exit the retry loop
+                         return analysis;
                      }
-                     if (parsed.orientation) {
-                         analysis.orientation = this.mapOrientation(parsed.orientation);
+                     catch (parseError) {
+                         this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+                         analysis.error = 'Failed to parse LLM response';
+                         return analysis;
                      }
-                     analysis.isValid = true;
-                     this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
                  }
-                 catch (parseError) {
-                     this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
-                     analysis.error = 'Failed to parse LLM response';
+             }
+             catch (e) {
+                 lastError = e;
+                 // Check if this is a vision/multimodal format error
+                 if ((0, spatial_reasoning_1.isVisionNotSupportedError)(e)) {
+                     this.console.warn(`[Discovery] ${formatType} format not supported, trying fallback...`);
+                     continue; // Try next format
                  }
+                 // Not a format error - don't retry
+                 this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+                 break;
              }
          }
-         catch (e) {
-             this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
-             analysis.error = `Analysis failed: ${e}`;
+         // All formats failed
+         if (lastError) {
+             const errorStr = String(lastError);
+             if ((0, spatial_reasoning_1.isVisionNotSupportedError)(lastError)) {
+                 analysis.error = 'Vision/image analysis not supported by configured LLM. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured.';
+             }
+             else {
+                 analysis.error = `Analysis failed: ${errorStr}`;
+             }
          }
          // Cache the analysis
          this.sceneCache.set(cameraId, analysis);
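The retry loop above reduces to a small, reusable pattern: try the preferred image format, fall back to the alternate format only when the failure looks like a vision/format error, and remember whichever format succeeded. A condensed TypeScript sketch of that strategy (the FormatPreference class and sendWithFormat callback are illustrative names, not part of the plugin):

    type ImageFormat = 'openai' | 'anthropic';

    // Condensed sketch of the strategy in the hunk above.
    class FormatPreference {
        constructor(private preferred: ImageFormat = 'openai') { }

        async send<T>(sendWithFormat: (format: ImageFormat) => Promise<T>): Promise<T> {
            const order: ImageFormat[] = this.preferred === 'anthropic'
                ? ['anthropic', 'openai']
                : ['openai', 'anthropic'];
            let lastError: unknown;
            for (const format of order) {
                try {
                    const result = await sendWithFormat(format);
                    this.preferred = format; // sticky: reuse the format that worked
                    return result;
                }
                catch (e) {
                    lastError = e;
                    if (!isVisionNotSupportedError(e)) {
                        break; // unrelated failure; don't retry with another format
                    }
                }
            }
            throw lastError;
        }
    }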