@blueharford/scrypted-spatial-awareness 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/plugin.zip CHANGED
Binary file
@@ -35120,19 +35120,41 @@ async function mediaObjectToBase64(mediaObject) {
 }
 /**
  * Build image content block for ChatCompletion API
- * Compatible with both OpenAI and Anthropic formats via @scrypted/llm
+ * Supports both OpenAI and Anthropic formats
+ * @param imageData - Image data with base64 and media type
+ * @param provider - The LLM provider type (openai, anthropic, or unknown)
  */
-function buildImageContent(imageData) {
-    // Use Anthropic's native format which @scrypted/llm should translate
-    // This format is more explicit about the base64 data
-    return {
-        type: 'image',
-        source: {
-            type: 'base64',
-            media_type: imageData.mediaType,
-            data: imageData.base64,
-        },
-    };
+function buildImageContent(imageData, provider = 'unknown') {
+    if (provider === 'openai') {
+        // OpenAI format: uses data URL with image_url wrapper
+        return {
+            type: 'image_url',
+            image_url: {
+                url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+            },
+        };
+    }
+    else if (provider === 'anthropic') {
+        // Anthropic format: uses separate base64 data and media_type
+        return {
+            type: 'image',
+            source: {
+                type: 'base64',
+                media_type: imageData.mediaType,
+                data: imageData.base64,
+            },
+        };
+    }
+    else {
+        // Unknown provider: try OpenAI format as it's more commonly supported
+        // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+        return {
+            type: 'image_url',
+            image_url: {
+                url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+            },
+        };
+    }
 }
 class SpatialReasoningEngine {
     config;
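For orientation, a minimal sketch (not part of the package) of what the reworked helper returns for each provider; the input object mirrors the { base64, mediaType } shape used above, and the sample values are placeholders:

// Hypothetical input, matching the shape buildImageContent() expects above
const imageData = { mediaType: 'image/jpeg', base64: '/9j/4AAQ...' };

buildImageContent(imageData, 'openai');
// -> { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,/9j/4AAQ...' } }

buildImageContent(imageData, 'anthropic');
// -> { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: '/9j/4AAQ...' } }

buildImageContent(imageData);
// -> provider defaults to 'unknown', which falls back to the OpenAI image_url shape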
@@ -35353,6 +35375,7 @@ class SpatialReasoningEngine {
     }
     llmSearched = false;
     llmProvider = null;
+    llmProviderType = 'unknown';
     /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
     async findLlmDevice() {
         if (this.llmDevice)
@@ -35371,30 +35394,39 @@ class SpatialReasoningEngine {
         if (device.interfaces?.includes('ChatCompletion')) {
             const deviceName = device.name?.toLowerCase() || '';
             const pluginId = device.pluginId?.toLowerCase() || '';
-            // Identify the provider type for logging
+            // Identify the provider type for logging and image format selection
             let providerType = 'Unknown';
-            if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
-                providerType = 'Scrypted LLM';
-            }
+            let providerTypeEnum = 'unknown';
             if (deviceName.includes('openai') || deviceName.includes('gpt')) {
                 providerType = 'OpenAI';
+                providerTypeEnum = 'openai';
             }
             else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
                 providerType = 'Anthropic';
+                providerTypeEnum = 'anthropic';
             }
             else if (deviceName.includes('ollama')) {
                 providerType = 'Ollama';
+                providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
             }
             else if (deviceName.includes('gemini') || deviceName.includes('google')) {
                 providerType = 'Google';
+                providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
             }
             else if (deviceName.includes('llama')) {
                 providerType = 'llama.cpp';
+                providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+            }
+            else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+                providerType = 'Scrypted LLM';
+                providerTypeEnum = 'unknown';
             }
             this.llmDevice = device;
             this.llmProvider = `${providerType} (${device.name})`;
+            this.llmProviderType = providerTypeEnum;
             this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
             this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+            this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
             this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
             return this.llmDevice;
         }
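Note on the detection above: the provider string later passed to buildImageContent is derived from the device name, roughly as follows (illustrative device names, not taken from the diff):

// 'GPT-4o (OpenAI)'                    -> providerTypeEnum 'openai'    -> image_url format
// 'Claude (Anthropic)'                 -> providerTypeEnum 'anthropic' -> base64 source format
// 'Ollama', 'Gemini', 'llama.cpp' etc. -> providerTypeEnum 'openai'    (OpenAI-compatible APIs)
// any other @scrypted/llm device       -> providerTypeEnum 'unknown'   (falls back to image_url)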
@@ -35412,6 +35444,10 @@ class SpatialReasoningEngine {
     getLlmProvider() {
         return this.llmProvider;
     }
+    /** Get the current LLM provider type for image format selection */
+    getLlmProviderType() {
+        return this.llmProviderType;
+    }
     /** Check if LLM is available */
     isLlmAvailable() {
         return this.llmDevice !== null;
@@ -35688,10 +35724,10 @@ class SpatialReasoningEngine {
         // Build message content - use multimodal format if we have an image
         let messageContent;
         if (imageData) {
-            // Vision-capable multimodal message format (Anthropic native format)
+            // Vision-capable multimodal message format (provider-specific)
             messageContent = [
                 { type: 'text', text: prompt },
-                buildImageContent(imageData),
+                buildImageContent(imageData, this.llmProviderType),
             ];
         }
         else {
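A sketch (assumed, based on the getChatCompletion call shown later in this diff) of the multimodal content this produces for an Anthropic provider; prompt and the base64 payload come from the surrounding code, and the literal values here are placeholders:

messageContent = [
    { type: 'text', text: prompt },
    { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: '<base64 payload>' } },
];
// For OpenAI-compatible providers the second entry is instead the image_url form returned by buildImageContent.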
@@ -35774,10 +35810,10 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
         // Build message content - use multimodal format if we have an image
         let messageContent;
         if (imageData) {
-            // Vision-capable multimodal message format (Anthropic native format)
+            // Vision-capable multimodal message format (provider-specific)
             messageContent = [
                 { type: 'text', text: prompt },
-                buildImageContent(imageData),
+                buildImageContent(imageData, this.llmProviderType),
             ];
         }
         else {
@@ -36017,6 +36053,7 @@ class TopologyDiscoveryEngine {
     topology = null;
     llmDevice = null;
     llmSearched = false;
+    llmProviderType = 'unknown';
     // Scene analysis cache (camera ID -> analysis)
     sceneCache = new Map();
     // Pending suggestions for user review
@@ -36080,8 +36117,25 @@ class TopologyDiscoveryEngine {
             if (!device)
                 continue;
             if (device.interfaces?.includes('ChatCompletion')) {
+                const deviceName = device.name?.toLowerCase() || '';
+                // Detect provider type for image format selection
+                if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+                    this.llmProviderType = 'openai';
+                }
+                else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+                    this.llmProviderType = 'anthropic';
+                }
+                else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+                    deviceName.includes('google') || deviceName.includes('llama')) {
+                    // These providers use OpenAI-compatible format
+                    this.llmProviderType = 'openai';
+                }
+                else {
+                    this.llmProviderType = 'unknown';
+                }
                 this.llmDevice = device;
                 this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+                this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
                 return this.llmDevice;
             }
         }
@@ -36133,14 +36187,14 @@ class TopologyDiscoveryEngine {
             return analysis;
         }
         try {
-            // Build multimodal message with Anthropic-native format
+            // Build multimodal message with provider-specific image format
            const result = await llm.getChatCompletion({
                 messages: [
                     {
                         role: 'user',
                         content: [
                             { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-                            (0, spatial_reasoning_1.buildImageContent)(imageData),
+                            (0, spatial_reasoning_1.buildImageContent)(imageData, this.llmProviderType),
                         ],
                     },
                 ],