@blueharford/scrypted-spatial-awareness 0.6.9 → 0.6.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +45 -21
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +46 -21
package/dist/plugin.zip
CHANGED
Binary file
package/out/main.nodejs.js
CHANGED
@@ -35905,32 +35905,56 @@ Examples of good descriptions:
 - "Landscaper with leaf blower heading to work truck"
 
 Generate ONLY the description, nothing else:`;
-        //
-        let
+        // Try multimodal format first, fall back to text-only if it fails
+        let result;
+        let usedVision = false;
         if (imageData) {
-
-
-
-
+            // First attempt: Try multimodal with image
+            try {
+                this.console.log(`[LLM] Attempting multimodal ${eventType} call with image...`);
+                const multimodalContent = [
+                    { type: 'text', text: prompt },
+                    buildImageContent(imageData, this.llmProviderType),
+                ];
+                result = await llm.getChatCompletion({
+                    messages: [
+                        {
+                            role: 'user',
+                            content: multimodalContent,
+                        },
+                    ],
+                    max_tokens: 100,
+                    temperature: 0.7,
+                });
+                usedVision = true;
+            }
+            catch (visionError) {
+                // If vision format fails, try text-only
+                if (isVisionFormatError(visionError)) {
+                    this.console.warn(`[LLM] Vision format not supported, falling back to text-only: ${visionError.message || visionError}`);
+                }
+                else {
+                    this.console.warn(`[LLM] Multimodal call failed, trying text-only: ${visionError.message || visionError}`);
+                }
+            }
         }
-
-
+        // If no result yet, try text-only
+        if (!result) {
+            this.console.log(`[LLM] Calling text-only getChatCompletion for ${eventType}...`);
+            result = await llm.getChatCompletion({
+                messages: [
+                    {
+                        role: 'user',
+                        content: prompt,
+                    },
+                ],
+                max_tokens: 100,
+                temperature: 0.7,
+            });
         }
-        // Call LLM using ChatCompletion interface
-        this.console.log(`[LLM] Calling getChatCompletion for ${eventType}...`);
-        const result = await llm.getChatCompletion({
-            messages: [
-                {
-                    role: 'user',
-                    content: messageContent,
-                },
-            ],
-            max_tokens: 100,
-            temperature: 0.7,
-        });
         const content = result?.choices?.[0]?.message?.content;
         if (content && typeof content === 'string') {
-            this.console.log(`[LLM] Got ${eventType} description: ${content.trim().substring(0, 50)}...`);
+            this.console.log(`[LLM] Got ${eventType} description (vision=${usedVision}): ${content.trim().substring(0, 50)}...`);
             return content.trim();
         }
         this.console.warn(`[LLM] No content in response for ${eventType}`);
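In short, the hunk above changes the LLM description call in two ways: when an event snapshot is available, the plugin now first attempts a multimodal getChatCompletion request (a text part plus an image part produced by buildImageContent), and if that attempt throws, for example because the provider rejects vision-style content as detected by isVisionFormatError, it falls back to the original text-only request; the success log also records whether vision was used. The rough TypeScript sketch below illustrates that flow. The ChatCompletionLike and MessagePart types, the generateDescription wrapper, and the bodies of buildImageContent and isVisionFormatError are illustrative assumptions, not code taken from the plugin source; only the getChatCompletion request shape, max_tokens, temperature, usedVision, and the log messages come from the diff.

// Hedged sketch of the 0.6.10 multimodal-with-fallback flow, roughly as it
// might read in package/src/core/spatial-reasoning.ts. Helper bodies and the
// surrounding types are assumptions inferred from the compiled diff.

type MessagePart =
    | { type: 'text'; text: string }
    | { type: 'image_url'; image_url: { url: string } };

type ChatCompletionResult = { choices?: { message?: { content?: string } }[] };

interface ChatCompletionLike {
    getChatCompletion(req: {
        messages: { role: 'user'; content: string | MessagePart[] }[];
        max_tokens?: number;
        temperature?: number;
    }): Promise<ChatCompletionResult>;
}

// Assumed helper: wraps the snapshot as an OpenAI-style image part.
function buildImageContent(imageData: string, _providerType?: string): MessagePart {
    return { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${imageData}` } };
}

// Assumed helper: heuristic check for "provider does not accept image content" errors.
function isVisionFormatError(err: unknown): boolean {
    const msg = String((err as any)?.message ?? err).toLowerCase();
    return msg.includes('image') || msg.includes('vision') || msg.includes('multimodal');
}

async function generateDescription(
    llm: ChatCompletionLike,
    prompt: string,
    eventType: string,
    imageData?: string,
): Promise<string | undefined> {
    let result: ChatCompletionResult | undefined;
    let usedVision = false;

    if (imageData) {
        // First attempt: multimodal message (text part + image part).
        try {
            result = await llm.getChatCompletion({
                messages: [{ role: 'user', content: [{ type: 'text', text: prompt }, buildImageContent(imageData)] }],
                max_tokens: 100,
                temperature: 0.7,
            });
            usedVision = true;
        }
        catch (visionError) {
            // Distinguish "vision format unsupported" from other failures, then fall back either way.
            const reason = isVisionFormatError(visionError) ? 'Vision format not supported' : 'Multimodal call failed';
            console.warn(`[LLM] ${reason}, falling back to text-only: ${visionError}`);
        }
    }

    // Second attempt: text-only prompt, used when there is no image or the vision call failed.
    if (!result) {
        console.log(`[LLM] Calling text-only getChatCompletion for ${eventType}...`);
        result = await llm.getChatCompletion({
            messages: [{ role: 'user', content: prompt }],
            max_tokens: 100,
            temperature: 0.7,
        });
    }

    const content = result?.choices?.[0]?.message?.content;
    if (content && typeof content === 'string') {
        console.log(`[LLM] Got ${eventType} description (vision=${usedVision}): ${content.trim().substring(0, 50)}...`);
        return content.trim();
    }
    console.warn(`[LLM] No content in response for ${eventType}`);
    return undefined;
}

Handling the fallback inside the catch rather than surfacing the error means providers without vision support still produce text-only descriptions instead of failing the event entirely, which matches the behavior the new log lines describe.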