@blueharford/scrypted-spatial-awareness 0.6.9 → 0.6.11

This diff shows the changes between two publicly released versions of the package, as published to one of the supported public registries. It is provided for informational purposes only.
package/dist/plugin.zip CHANGED
Binary file
@@ -35137,54 +35137,23 @@ async function mediaObjectToBase64(mediaObject) {
  }
  /**
  * Build image content block for ChatCompletion API
- * Supports OpenAI, Anthropic, and @scrypted/llm formats
+ *
+ * IMPORTANT: @scrypted/llm uses OpenAI-compatible format for ALL providers.
+ * The plugin internally converts this format to the appropriate provider format.
+ * So we ALWAYS use the OpenAI image_url format with data URI.
+ *
  * @param imageData - Image data with base64 and media type
- * @param provider - The LLM provider type
+ * @param provider - The LLM provider type (currently unused, kept for logging)
  */
  function buildImageContent(imageData, provider = 'unknown') {
- if (provider === 'openai') {
- // OpenAI format: uses data URL with image_url wrapper
- return {
- type: 'image_url',
- image_url: {
- url: `data:${imageData.mediaType};base64,${imageData.base64}`,
- detail: 'auto',
- },
- };
- }
- else if (provider === 'anthropic') {
- // Anthropic official format: uses 'data' key
- return {
- type: 'image',
- source: {
- type: 'base64',
- media_type: imageData.mediaType,
- data: imageData.base64,
- },
- };
- }
- else if (provider === 'scrypted') {
- // @scrypted/llm format: uses 'base64' key (per error path .image.source.base64)
- return {
- type: 'image',
- source: {
- type: 'base64',
- media_type: imageData.mediaType,
- base64: imageData.base64,
- },
- };
- }
- else {
- // Unknown provider: try @scrypted/llm format first
- return {
- type: 'image',
- source: {
- type: 'base64',
- media_type: imageData.mediaType,
- base64: imageData.base64,
- },
- };
- }
+ // @scrypted/llm uses OpenAI-compatible format for ALL providers
+ // The plugin handles internal conversion to Anthropic/other formats
+ return {
+ type: 'image_url',
+ image_url: {
+ url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+ },
+ };
  }
  /** Check if an error indicates vision/multimodal content format issue (should try alternate format) */
  function isVisionFormatError(error) {
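
Editor's note: after this hunk, every provider receives the same OpenAI-style payload. A minimal sketch of the user message that results, assuming the imageData shape documented in the JSDoc above; the sample prompt text and truncated base64 string are placeholders, not values from the package:

// Sketch only — not part of the published plugin. Shows the OpenAI-compatible
// multimodal message the new buildImageContent() yields when paired with a
// text block, per the hunk above.
const imageData = { mediaType: 'image/jpeg', base64: '/9j/4AAQSkZJRg...' }; // placeholder sample
const userMessage = {
    role: 'user',
    content: [
        { type: 'text', text: 'Describe what the person is doing.' }, // placeholder prompt
        {
            type: 'image_url',
            image_url: { url: `data:${imageData.mediaType};base64,${imageData.base64}` },
        },
    ],
};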
@@ -35905,32 +35874,56 @@ Examples of good descriptions:
  - "Landscaper with leaf blower heading to work truck"

  Generate ONLY the description, nothing else:`;
- // Build message content - use multimodal format if we have an image
- let messageContent;
+ // Try multimodal format first, fall back to text-only if it fails
+ let result;
+ let usedVision = false;
  if (imageData) {
- messageContent = [
- { type: 'text', text: prompt },
- buildImageContent(imageData, this.llmProviderType),
- ];
+ // First attempt: Try multimodal with image
+ try {
+ this.console.log(`[LLM] Attempting multimodal ${eventType} call with image...`);
+ const multimodalContent = [
+ { type: 'text', text: prompt },
+ buildImageContent(imageData, this.llmProviderType),
+ ];
+ result = await llm.getChatCompletion({
+ messages: [
+ {
+ role: 'user',
+ content: multimodalContent,
+ },
+ ],
+ max_tokens: 100,
+ temperature: 0.7,
+ });
+ usedVision = true;
+ }
+ catch (visionError) {
+ // If vision format fails, try text-only
+ if (isVisionFormatError(visionError)) {
+ this.console.warn(`[LLM] Vision format not supported, falling back to text-only: ${visionError.message || visionError}`);
+ }
+ else {
+ this.console.warn(`[LLM] Multimodal call failed, trying text-only: ${visionError.message || visionError}`);
+ }
+ }
  }
- else {
- messageContent = prompt;
+ // If no result yet, try text-only
+ if (!result) {
+ this.console.log(`[LLM] Calling text-only getChatCompletion for ${eventType}...`);
+ result = await llm.getChatCompletion({
+ messages: [
+ {
+ role: 'user',
+ content: prompt,
+ },
+ ],
+ max_tokens: 100,
+ temperature: 0.7,
+ });
  }
- // Call LLM using ChatCompletion interface
- this.console.log(`[LLM] Calling getChatCompletion for ${eventType}...`);
- const result = await llm.getChatCompletion({
- messages: [
- {
- role: 'user',
- content: messageContent,
- },
- ],
- max_tokens: 100,
- temperature: 0.7,
- });
  const content = result?.choices?.[0]?.message?.content;
  if (content && typeof content === 'string') {
- this.console.log(`[LLM] Got ${eventType} description: ${content.trim().substring(0, 50)}...`);
+ this.console.log(`[LLM] Got ${eventType} description (vision=${usedVision}): ${content.trim().substring(0, 50)}...`);
  return content.trim();
  }
  this.console.warn(`[LLM] No content in response for ${eventType}`);
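
Editor's note: this second hunk turns the single getChatCompletion call into a try-vision-then-retry flow. A condensed standalone sketch of that control flow, assuming llm.getChatCompletion and isVisionFormatError behave as shown in the diff; describeWithFallback is a hypothetical name, not from the package:

// Sketch only — not part of the published plugin. Condenses the control flow
// the hunk above adds: attempt a multimodal call, and on any failure retry
// with a plain text prompt.
async function describeWithFallback(llm, prompt, imageContent) {
    if (imageContent) {
        try {
            // Vision attempt: a text block plus the OpenAI-style image_url block.
            return await llm.getChatCompletion({
                messages: [{ role: 'user', content: [{ type: 'text', text: prompt }, imageContent] }],
                max_tokens: 100,
                temperature: 0.7,
            });
        }
        catch (e) {
            // The plugin uses isVisionFormatError(e) here only to choose a log
            // message; either way it falls through to the text-only retry.
        }
    }
    // Text-only call: content is a plain string rather than a block array.
    return llm.getChatCompletion({
        messages: [{ role: 'user', content: prompt }],
        max_tokens: 100,
        temperature: 0.7,
    });
}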