@blueharford/scrypted-spatial-awareness 0.6.9 → 0.6.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +59 -66
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +60 -63
package/out/plugin.zip
CHANGED (binary file)
package/package.json
CHANGED

The single changed line is the version field, bumped from 0.6.9 to 0.6.11.

package/src/core/spatial-reasoning.ts
CHANGED

@@ -154,51 +154,23 @@ export type LlmProvider = 'openai' | 'anthropic' | 'scrypted' | 'unknown';

 /**
  * Build image content block for ChatCompletion API
- *
+ *
+ * IMPORTANT: @scrypted/llm uses OpenAI-compatible format for ALL providers.
+ * The plugin internally converts this format to the appropriate provider format.
+ * So we ALWAYS use the OpenAI image_url format with data URI.
+ *
  * @param imageData - Image data with base64 and media type
- * @param provider - The LLM provider type
+ * @param provider - The LLM provider type (currently unused, kept for logging)
  */
 export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
  [8 removed lines not preserved in this diff view]
-    };
-  } else if (provider === 'anthropic') {
-    // Anthropic official format: uses 'data' key
-    return {
-      type: 'image',
-      source: {
-        type: 'base64',
-        media_type: imageData.mediaType,
-        data: imageData.base64,
-      },
-    };
-  } else if (provider === 'scrypted') {
-    // @scrypted/llm format: uses 'base64' key (per error path .image.source.base64)
-    return {
-      type: 'image',
-      source: {
-        type: 'base64',
-        media_type: imageData.mediaType,
-        base64: imageData.base64,
-      },
-    };
-  } else {
-    // Unknown provider: try @scrypted/llm format first
-    return {
-      type: 'image',
-      source: {
-        type: 'base64',
-        media_type: imageData.mediaType,
-        base64: imageData.base64,
-      },
-    };
-  }
+  // @scrypted/llm uses OpenAI-compatible format for ALL providers
+  // The plugin handles internal conversion to Anthropic/other formats
+  return {
+    type: 'image_url',
+    image_url: {
+      url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+    },
+  };
 }

 /** Check if an error indicates vision/multimodal content format issue (should try alternate format) */
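In short, 0.6.11 drops the per-provider image formats entirely: every request now carries the OpenAI-style image_url part with a data URI, and @scrypted/llm converts internally. A minimal standalone sketch of the resulting message shape follows; the ImageData interface, the buildImagePart name, and the sample values are illustrative assumptions, not the plugin's exports.

interface ImageData {
  base64: string;      // base64-encoded image bytes
  mediaType: string;   // e.g. 'image/jpeg'
}

// OpenAI-compatible image part, in the shape buildImageContent now emits
// for every provider (hypothetical helper name for illustration)
function buildImagePart(img: ImageData) {
  return {
    type: 'image_url',
    image_url: { url: `data:${img.mediaType};base64,${img.base64}` },
  };
}

// A multimodal user message then combines a text part with the image part:
const message = {
  role: 'user',
  content: [
    { type: 'text', text: 'Describe the scene.' },
    buildImagePart({ base64: 'iVBORw0KGgo...', mediaType: 'image/png' }),
  ],
};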
@@ -1077,33 +1049,58 @@ Examples of good descriptions:

 Generate ONLY the description, nothing else:`;

-    //
-    let
+    // Try multimodal format first, fall back to text-only if it fails
+    let result: any;
+    let usedVision = false;
+
     if (imageData) {
  [6 removed lines not preserved in this diff view]
+      // First attempt: Try multimodal with image
+      try {
+        this.console.log(`[LLM] Attempting multimodal ${eventType} call with image...`);
+        const multimodalContent = [
+          { type: 'text', text: prompt },
+          buildImageContent(imageData, this.llmProviderType),
+        ];
+
+        result = await llm.getChatCompletion({
+          messages: [
+            {
+              role: 'user',
+              content: multimodalContent,
+            },
+          ],
+          max_tokens: 100,
+          temperature: 0.7,
+        });
+        usedVision = true;
+      } catch (visionError: any) {
+        // If vision format fails, try text-only
+        if (isVisionFormatError(visionError)) {
+          this.console.warn(`[LLM] Vision format not supported, falling back to text-only: ${visionError.message || visionError}`);
+        } else {
+          this.console.warn(`[LLM] Multimodal call failed, trying text-only: ${visionError.message || visionError}`);
+        }
+      }
     }

-    //
  [11 removed lines not preserved in this diff view]
+    // If no result yet, try text-only
+    if (!result) {
+      this.console.log(`[LLM] Calling text-only getChatCompletion for ${eventType}...`);
+      result = await llm.getChatCompletion({
+        messages: [
+          {
+            role: 'user',
+            content: prompt,
+          },
+        ],
+        max_tokens: 100,
+        temperature: 0.7,
+      });
+    }

     const content = result?.choices?.[0]?.message?.content;
     if (content && typeof content === 'string') {
-      this.console.log(`[LLM] Got ${eventType} description: ${content.trim().substring(0, 50)}...`);
+      this.console.log(`[LLM] Got ${eventType} description (vision=${usedVision}): ${content.trim().substring(0, 50)}...`);
       return content.trim();
     }

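Taken together, the generation path in 0.6.11 is: attempt a multimodal completion first, and if it throws (whether or not it is a vision-format error), retry with a plain text prompt. A condensed standalone sketch of that fallback pattern follows; the LlmClient interface, the describeEvent name, and the isVisionFormatError stub are assumptions shaped by the diff, not the plugin's actual types.

type ChatResult = { choices?: { message?: { content?: string } }[] };

interface LlmClient {
  getChatCompletion(req: {
    messages: { role: string; content: any }[];
    max_tokens: number;
    temperature: number;
  }): Promise<ChatResult>;
}

// Assumption: format errors can be recognized from the error message text
function isVisionFormatError(e: any): boolean {
  return /image|vision|multimodal/i.test(String(e?.message ?? e));
}

async function describeEvent(llm: LlmClient, prompt: string, imagePart?: any): Promise<string | undefined> {
  let result: ChatResult | undefined;

  if (imagePart) {
    try {
      // First attempt: multimodal content (text part + image part)
      result = await llm.getChatCompletion({
        messages: [{ role: 'user', content: [{ type: 'text', text: prompt }, imagePart] }],
        max_tokens: 100,
        temperature: 0.7,
      });
    } catch (e: any) {
      // Vision-format errors and other failures both fall through to text-only
      const reason = isVisionFormatError(e) ? 'vision format not supported' : 'multimodal call failed';
      console.warn(`${reason}, retrying text-only:`, e?.message ?? e);
    }
  }

  if (!result) {
    // Fallback (or image-less) path: plain text prompt
    result = await llm.getChatCompletion({
      messages: [{ role: 'user', content: prompt }],
      max_tokens: 100,
      temperature: 0.7,
    });
  }

  const content = result?.choices?.[0]?.message?.content;
  return typeof content === 'string' ? content.trim() : undefined;
}

The design choice mirrored here is that the fallback triggers on any multimodal failure, not only recognized vision-format errors; the distinction only affects the log message.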