@blueharford/scrypted-spatial-awareness 0.5.3 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/plugin.zip CHANGED
Binary file
@@ -35095,6 +35095,7 @@ Object.defineProperty(exports, "__esModule", ({ value: true }));
35095
35095
  exports.SpatialReasoningEngine = void 0;
35096
35096
  exports.mediaObjectToBase64 = mediaObjectToBase64;
35097
35097
  exports.buildImageContent = buildImageContent;
35098
+ exports.isVisionFormatError = isVisionFormatError;
35098
35099
  const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
35099
35100
  const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
35100
35101
  const { systemManager, mediaManager } = sdk_1.default;
@@ -35107,10 +35108,20 @@ async function mediaObjectToBase64(mediaObject) {
35107
35108
  try {
35108
35109
  // Convert MediaObject to Buffer using mediaManager
35109
35110
  const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, sdk_1.ScryptedMimeTypes.Image);
35111
+ if (!buffer || buffer.length === 0) {
35112
+ console.warn('Failed to convert MediaObject: empty buffer');
35113
+ return null;
35114
+ }
35110
35115
  // Convert buffer to base64 (raw, no data URL prefix)
35111
35116
  const base64 = buffer.toString('base64');
35117
+ // Validate base64 - check it's not empty and looks valid
35118
+ if (!base64 || base64.length < 100) {
35119
+ console.warn(`Invalid base64: length=${base64?.length || 0}`);
35120
+ return null;
35121
+ }
35112
35122
  // Determine MIME type - default to JPEG for camera images
35113
35123
  const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
35124
+ console.log(`[Image] Converted to base64: ${base64.length} chars, type=${mediaType}`);
35114
35125
  return { base64, mediaType };
35115
35126
  }
35116
35127
  catch (e) {
@@ -35120,9 +35131,9 @@ async function mediaObjectToBase64(mediaObject) {
35120
35131
  }
35121
35132
  /**
35122
35133
  * Build image content block for ChatCompletion API
35123
- * Supports both OpenAI and Anthropic formats
35134
+ * Supports OpenAI, Anthropic, and @scrypted/llm formats
35124
35135
  * @param imageData - Image data with base64 and media type
35125
- * @param provider - The LLM provider type (openai, anthropic, or unknown)
35136
+ * @param provider - The LLM provider type
35126
35137
  */
35127
35138
  function buildImageContent(imageData, provider = 'unknown') {
35128
35139
  if (provider === 'openai') {
@@ -35131,11 +35142,12 @@ function buildImageContent(imageData, provider = 'unknown') {
35131
35142
  type: 'image_url',
35132
35143
  image_url: {
35133
35144
  url: `data:${imageData.mediaType};base64,${imageData.base64}`,
35145
+ detail: 'auto',
35134
35146
  },
35135
35147
  };
35136
35148
  }
35137
35149
  else if (provider === 'anthropic') {
35138
- // Anthropic format: uses separate base64 data and media_type
35150
+ // Anthropic official format: uses 'data' key
35139
35151
  return {
35140
35152
  type: 'image',
35141
35153
  source: {
@@ -35145,17 +35157,43 @@ function buildImageContent(imageData, provider = 'unknown') {
35145
35157
  },
35146
35158
  };
35147
35159
  }
35160
+ else if (provider === 'scrypted') {
35161
+ // @scrypted/llm format: uses 'base64' key (per error path .image.source.base64)
35162
+ return {
35163
+ type: 'image',
35164
+ source: {
35165
+ type: 'base64',
35166
+ media_type: imageData.mediaType,
35167
+ base64: imageData.base64,
35168
+ },
35169
+ };
35170
+ }
35148
35171
  else {
35149
- // Unknown provider: try OpenAI format as it's more commonly supported
35150
- // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
35172
+ // Unknown provider: try @scrypted/llm format first
35151
35173
  return {
35152
- type: 'image_url',
35153
- image_url: {
35154
- url: `data:${imageData.mediaType};base64,${imageData.base64}`,
35174
+ type: 'image',
35175
+ source: {
35176
+ type: 'base64',
35177
+ media_type: imageData.mediaType,
35178
+ base64: imageData.base64,
35155
35179
  },
35156
35180
  };
35157
35181
  }
35158
35182
  }
/**
 * Check if an error indicates a vision/multimodal content format issue,
 * i.e. the request should be retried with an alternate image format.
 *
 * The decision is a substring heuristic over the error's text. The text is
 * gathered from `String(error)` and, for object errors, from `error.message`
 * and a nested `error.error` / `error.error.message`, because a plain object
 * stringifies to "[object Object]" and would otherwise never match.
 *
 * @param {unknown} error - The value caught from the failed LLM call.
 * @returns {boolean} True when the error text looks like an image/content
 *   format rejection; false otherwise (including for null/undefined).
 */
function isVisionFormatError(error) {
    if (error == null) {
        return false;
    }
    const fragments = [String(error)];
    if (typeof error === 'object') {
        if (typeof error.message === 'string') {
            fragments.push(error.message);
        }
        const nested = error.error;
        if (typeof nested === 'string') {
            fragments.push(nested);
        }
        else if (nested && typeof nested.message === 'string') {
            fragments.push(nested.message);
        }
    }
    // Compare case-insensitively so differently-capitalised variants of the
    // same provider message do not each need their own entry.
    const text = fragments.join('\n').toLowerCase();
    const formatHints = [
        'content.str',
        'should be a valid string',
        'invalid content type',
        'does not support vision',
        'invalid base64',
        '.image.source',
        '.image_url',
    ];
    if (formatHints.some((hint) => text.includes(hint))) {
        return true;
    }
    if (text.includes('image_url') && text.includes('not supported')) {
        return true;
    }
    // Require 400 as a standalone number so values such as "14000ms" are not
    // mistaken for an HTTP 400 status.
    return /\b400\b/.test(text) && text.includes('content');
}
35159
35197
  class SpatialReasoningEngine {
35160
35198
  config;
35161
35199
  console;
@@ -36186,72 +36224,120 @@ class TopologyDiscoveryEngine {
36186
36224
  analysis.error = 'Failed to capture camera snapshot';
36187
36225
  return analysis;
36188
36226
  }
36189
- try {
36190
- // Build multimodal message with provider-specific image format
36191
- const result = await llm.getChatCompletion({
36192
- messages: [
36193
- {
36194
- role: 'user',
36195
- content: [
36196
- { type: 'text', text: SCENE_ANALYSIS_PROMPT },
36197
- (0, spatial_reasoning_1.buildImageContent)(imageData, this.llmProviderType),
36198
- ],
36199
- },
36200
- ],
36201
- max_tokens: 500,
36202
- temperature: 0.3,
36203
- });
36204
- const content = result?.choices?.[0]?.message?.content;
36205
- if (content && typeof content === 'string') {
36206
- try {
36207
- // Extract JSON from response (handle markdown code blocks)
36208
- let jsonStr = content.trim();
36209
- if (jsonStr.startsWith('```')) {
36210
- jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
36211
- }
36212
- const parsed = JSON.parse(jsonStr);
36213
- // Map parsed data to our types
36214
- if (Array.isArray(parsed.landmarks)) {
36215
- analysis.landmarks = parsed.landmarks.map((l) => ({
36216
- name: l.name || 'Unknown',
36217
- type: this.mapLandmarkType(l.type),
36218
- confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
36219
- description: l.description || '',
36220
- boundingBox: l.boundingBox,
36221
- }));
36222
- }
36223
- if (Array.isArray(parsed.zones)) {
36224
- analysis.zones = parsed.zones.map((z) => ({
36225
- name: z.name || 'Unknown',
36226
- type: this.mapZoneType(z.type),
36227
- coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
36228
- description: z.description || '',
36229
- boundingBox: z.boundingBox,
36230
- }));
36231
- }
36232
- if (parsed.edges && typeof parsed.edges === 'object') {
36233
- analysis.edges = {
36234
- top: parsed.edges.top || '',
36235
- left: parsed.edges.left || '',
36236
- right: parsed.edges.right || '',
36237
- bottom: parsed.edges.bottom || '',
36238
- };
36227
+ // Try with detected provider format first, then fallback to alternates
36228
+ // The order matters: try the most likely formats first
36229
+ const formatsToTry = [];
36230
+ // Start with detected format
36231
+ formatsToTry.push(this.llmProviderType);
36232
+ // Add fallbacks based on detected provider
36233
+ if (this.llmProviderType === 'openai') {
36234
+ formatsToTry.push('scrypted', 'anthropic');
36235
+ }
36236
+ else if (this.llmProviderType === 'anthropic') {
36237
+ formatsToTry.push('scrypted', 'openai');
36238
+ }
36239
+ else if (this.llmProviderType === 'scrypted') {
36240
+ formatsToTry.push('anthropic', 'openai');
36241
+ }
36242
+ else {
36243
+ // Unknown - try all formats
36244
+ formatsToTry.push('scrypted', 'anthropic', 'openai');
36245
+ }
36246
+ let lastError = null;
36247
+ for (const formatType of formatsToTry) {
36248
+ try {
36249
+ this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
36250
+ // Build multimodal message with provider-specific image format
36251
+ const result = await llm.getChatCompletion({
36252
+ messages: [
36253
+ {
36254
+ role: 'user',
36255
+ content: [
36256
+ { type: 'text', text: SCENE_ANALYSIS_PROMPT },
36257
+ (0, spatial_reasoning_1.buildImageContent)(imageData, formatType),
36258
+ ],
36259
+ },
36260
+ ],
36261
+ max_tokens: 500,
36262
+ temperature: 0.3,
36263
+ });
36264
+ const content = result?.choices?.[0]?.message?.content;
36265
+ if (content && typeof content === 'string') {
36266
+ try {
36267
+ // Extract JSON from response (handle markdown code blocks)
36268
+ let jsonStr = content.trim();
36269
+ if (jsonStr.startsWith('```')) {
36270
+ jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
36271
+ }
36272
+ const parsed = JSON.parse(jsonStr);
36273
+ // Map parsed data to our types
36274
+ if (Array.isArray(parsed.landmarks)) {
36275
+ analysis.landmarks = parsed.landmarks.map((l) => ({
36276
+ name: l.name || 'Unknown',
36277
+ type: this.mapLandmarkType(l.type),
36278
+ confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
36279
+ description: l.description || '',
36280
+ boundingBox: l.boundingBox,
36281
+ }));
36282
+ }
36283
+ if (Array.isArray(parsed.zones)) {
36284
+ analysis.zones = parsed.zones.map((z) => ({
36285
+ name: z.name || 'Unknown',
36286
+ type: this.mapZoneType(z.type),
36287
+ coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
36288
+ description: z.description || '',
36289
+ boundingBox: z.boundingBox,
36290
+ }));
36291
+ }
36292
+ if (parsed.edges && typeof parsed.edges === 'object') {
36293
+ analysis.edges = {
36294
+ top: parsed.edges.top || '',
36295
+ left: parsed.edges.left || '',
36296
+ right: parsed.edges.right || '',
36297
+ bottom: parsed.edges.bottom || '',
36298
+ };
36299
+ }
36300
+ if (parsed.orientation) {
36301
+ analysis.orientation = this.mapOrientation(parsed.orientation);
36302
+ }
36303
+ analysis.isValid = true;
36304
+ this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
36305
+ // Update the preferred format for future requests
36306
+ if (formatType !== this.llmProviderType) {
36307
+ this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
36308
+ this.llmProviderType = formatType;
36309
+ }
36310
+ // Success - exit the retry loop
36311
+ return analysis;
36239
36312
  }
36240
- if (parsed.orientation) {
36241
- analysis.orientation = this.mapOrientation(parsed.orientation);
36313
+ catch (parseError) {
36314
+ this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
36315
+ analysis.error = 'Failed to parse LLM response';
36316
+ return analysis;
36242
36317
  }
36243
- analysis.isValid = true;
36244
- this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
36245
36318
  }
36246
- catch (parseError) {
36247
- this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
36248
- analysis.error = 'Failed to parse LLM response';
36319
+ }
36320
+ catch (e) {
36321
+ lastError = e;
36322
+ // Check if this is a vision/multimodal format error
36323
+ if ((0, spatial_reasoning_1.isVisionFormatError)(e)) {
36324
+ this.console.warn(`[Discovery] ${formatType} format failed, trying fallback...`);
36325
+ continue; // Try next format
36249
36326
  }
36327
+ // Not a format error - don't retry
36328
+ this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
36329
+ break;
36250
36330
  }
36251
36331
  }
36252
- catch (e) {
36253
- this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
36254
- analysis.error = `Analysis failed: ${e}`;
36332
+ // All formats failed
36333
+ if (lastError) {
36334
+ const errorStr = String(lastError);
36335
+ if ((0, spatial_reasoning_1.isVisionFormatError)(lastError)) {
36336
+ analysis.error = 'Vision/image analysis failed with all formats. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured and the @scrypted/llm plugin supports vision.';
36337
+ }
36338
+ else {
36339
+ analysis.error = `Analysis failed: ${errorStr}`;
36340
+ }
36255
36341
  }
36256
36342
  // Cache the analysis
36257
36343
  this.sceneCache.set(cameraId, analysis);