@blueharford/scrypted-spatial-awareness 0.6.9 → 0.6.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +59 -66
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +60 -63
package/dist/plugin.zip
CHANGED
|
Binary file
|
package/out/main.nodejs.js
CHANGED
|
@@ -35137,54 +35137,23 @@ async function mediaObjectToBase64(mediaObject) {
|
|
|
35137
35137
|
}
|
|
35138
35138
|
/**
|
|
35139
35139
|
* Build image content block for ChatCompletion API
|
|
35140
|
-
*
|
|
35140
|
+
*
|
|
35141
|
+
* IMPORTANT: @scrypted/llm uses OpenAI-compatible format for ALL providers.
|
|
35142
|
+
* The plugin internally converts this format to the appropriate provider format.
|
|
35143
|
+
* So we ALWAYS use the OpenAI image_url format with data URI.
|
|
35144
|
+
*
|
|
35141
35145
|
* @param imageData - Image data with base64 and media type
|
|
35142
|
-
* @param provider - The LLM provider type
|
|
35146
|
+
* @param provider - The LLM provider type (currently unused, kept for logging)
|
|
35143
35147
|
*/
|
|
35144
35148
|
function buildImageContent(imageData, provider = 'unknown') {
|
|
35145
|
-
|
|
35146
|
-
|
|
35147
|
-
|
|
35148
|
-
|
|
35149
|
-
|
|
35150
|
-
|
|
35151
|
-
|
|
35152
|
-
|
|
35153
|
-
};
|
|
35154
|
-
}
|
|
35155
|
-
else if (provider === 'anthropic') {
|
|
35156
|
-
// Anthropic official format: uses 'data' key
|
|
35157
|
-
return {
|
|
35158
|
-
type: 'image',
|
|
35159
|
-
source: {
|
|
35160
|
-
type: 'base64',
|
|
35161
|
-
media_type: imageData.mediaType,
|
|
35162
|
-
data: imageData.base64,
|
|
35163
|
-
},
|
|
35164
|
-
};
|
|
35165
|
-
}
|
|
35166
|
-
else if (provider === 'scrypted') {
|
|
35167
|
-
// @scrypted/llm format: uses 'base64' key (per error path .image.source.base64)
|
|
35168
|
-
return {
|
|
35169
|
-
type: 'image',
|
|
35170
|
-
source: {
|
|
35171
|
-
type: 'base64',
|
|
35172
|
-
media_type: imageData.mediaType,
|
|
35173
|
-
base64: imageData.base64,
|
|
35174
|
-
},
|
|
35175
|
-
};
|
|
35176
|
-
}
|
|
35177
|
-
else {
|
|
35178
|
-
// Unknown provider: try @scrypted/llm format first
|
|
35179
|
-
return {
|
|
35180
|
-
type: 'image',
|
|
35181
|
-
source: {
|
|
35182
|
-
type: 'base64',
|
|
35183
|
-
media_type: imageData.mediaType,
|
|
35184
|
-
base64: imageData.base64,
|
|
35185
|
-
},
|
|
35186
|
-
};
|
|
35187
|
-
}
|
|
35149
|
+
// @scrypted/llm uses OpenAI-compatible format for ALL providers
|
|
35150
|
+
// The plugin handles internal conversion to Anthropic/other formats
|
|
35151
|
+
return {
|
|
35152
|
+
type: 'image_url',
|
|
35153
|
+
image_url: {
|
|
35154
|
+
url: `data:${imageData.mediaType};base64,${imageData.base64}`,
|
|
35155
|
+
},
|
|
35156
|
+
};
|
|
35188
35157
|
}
|
|
35189
35158
|
/** Check if an error indicates vision/multimodal content format issue (should try alternate format) */
|
|
35190
35159
|
function isVisionFormatError(error) {
|
|
@@ -35905,32 +35874,56 @@ Examples of good descriptions:
|
|
|
35905
35874
|
- "Landscaper with leaf blower heading to work truck"
|
|
35906
35875
|
|
|
35907
35876
|
Generate ONLY the description, nothing else:`;
|
|
35908
|
-
//
|
|
35909
|
-
let
|
|
35877
|
+
// Try multimodal format first, fall back to text-only if it fails
|
|
35878
|
+
let result;
|
|
35879
|
+
let usedVision = false;
|
|
35910
35880
|
if (imageData) {
|
|
35911
|
-
|
|
35912
|
-
|
|
35913
|
-
|
|
35914
|
-
|
|
35881
|
+
// First attempt: Try multimodal with image
|
|
35882
|
+
try {
|
|
35883
|
+
this.console.log(`[LLM] Attempting multimodal ${eventType} call with image...`);
|
|
35884
|
+
const multimodalContent = [
|
|
35885
|
+
{ type: 'text', text: prompt },
|
|
35886
|
+
buildImageContent(imageData, this.llmProviderType),
|
|
35887
|
+
];
|
|
35888
|
+
result = await llm.getChatCompletion({
|
|
35889
|
+
messages: [
|
|
35890
|
+
{
|
|
35891
|
+
role: 'user',
|
|
35892
|
+
content: multimodalContent,
|
|
35893
|
+
},
|
|
35894
|
+
],
|
|
35895
|
+
max_tokens: 100,
|
|
35896
|
+
temperature: 0.7,
|
|
35897
|
+
});
|
|
35898
|
+
usedVision = true;
|
|
35899
|
+
}
|
|
35900
|
+
catch (visionError) {
|
|
35901
|
+
// If vision format fails, try text-only
|
|
35902
|
+
if (isVisionFormatError(visionError)) {
|
|
35903
|
+
this.console.warn(`[LLM] Vision format not supported, falling back to text-only: ${visionError.message || visionError}`);
|
|
35904
|
+
}
|
|
35905
|
+
else {
|
|
35906
|
+
this.console.warn(`[LLM] Multimodal call failed, trying text-only: ${visionError.message || visionError}`);
|
|
35907
|
+
}
|
|
35908
|
+
}
|
|
35915
35909
|
}
|
|
35916
|
-
|
|
35917
|
-
|
|
35910
|
+
// If no result yet, try text-only
|
|
35911
|
+
if (!result) {
|
|
35912
|
+
this.console.log(`[LLM] Calling text-only getChatCompletion for ${eventType}...`);
|
|
35913
|
+
result = await llm.getChatCompletion({
|
|
35914
|
+
messages: [
|
|
35915
|
+
{
|
|
35916
|
+
role: 'user',
|
|
35917
|
+
content: prompt,
|
|
35918
|
+
},
|
|
35919
|
+
],
|
|
35920
|
+
max_tokens: 100,
|
|
35921
|
+
temperature: 0.7,
|
|
35922
|
+
});
|
|
35918
35923
|
}
|
|
35919
|
-
// Call LLM using ChatCompletion interface
|
|
35920
|
-
this.console.log(`[LLM] Calling getChatCompletion for ${eventType}...`);
|
|
35921
|
-
const result = await llm.getChatCompletion({
|
|
35922
|
-
messages: [
|
|
35923
|
-
{
|
|
35924
|
-
role: 'user',
|
|
35925
|
-
content: messageContent,
|
|
35926
|
-
},
|
|
35927
|
-
],
|
|
35928
|
-
max_tokens: 100,
|
|
35929
|
-
temperature: 0.7,
|
|
35930
|
-
});
|
|
35931
35924
|
const content = result?.choices?.[0]?.message?.content;
|
|
35932
35925
|
if (content && typeof content === 'string') {
|
|
35933
|
-
this.console.log(`[LLM] Got ${eventType} description: ${content.trim().substring(0, 50)}...`);
|
|
35926
|
+
this.console.log(`[LLM] Got ${eventType} description (vision=${usedVision}): ${content.trim().substring(0, 50)}...`);
|
|
35934
35927
|
return content.trim();
|
|
35935
35928
|
}
|
|
35936
35929
|
this.console.warn(`[LLM] No content in response for ${eventType}`);
|