@blueharford/scrypted-spatial-awareness 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +76 -22
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +58 -20
- package/src/core/topology-discovery.ts +20 -3
package/out/plugin.zip
CHANGED
Binary file
package/src/core/spatial-reasoning.ts
CHANGED
@@ -100,21 +100,44 @@ export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<Ima
 }
 }
 
+/** LLM Provider type for image format selection */
+export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
+
 /**
  * Build image content block for ChatCompletion API
- *
+ * Supports both OpenAI and Anthropic formats
+ * @param imageData - Image data with base64 and media type
+ * @param provider - The LLM provider type (openai, anthropic, or unknown)
  */
-export function buildImageContent(imageData: ImageData): any {
-  …
+export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
+  if (provider === 'openai') {
+    // OpenAI format: uses data URL with image_url wrapper
+    return {
+      type: 'image_url',
+      image_url: {
+        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      },
+    };
+  } else if (provider === 'anthropic') {
+    // Anthropic format: uses separate base64 data and media_type
+    return {
+      type: 'image',
+      source: {
+        type: 'base64',
+        media_type: imageData.mediaType,
+        data: imageData.base64,
+      },
+    };
+  } else {
+    // Unknown provider: try OpenAI format as it's more commonly supported
+    // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+    return {
+      type: 'image_url',
+      image_url: {
+        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      },
+    };
+  }
 }
 
 export class SpatialReasoningEngine {
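For reference, the two shapes this function now emits, shown with a hypothetical `ImageData` value (the base64 payload is truncated for readability):

```typescript
// Hypothetical ImageData value; base64 payload truncated for illustration.
const imageData = { mediaType: 'image/jpeg', base64: '/9j/4AAQ…' };

// OpenAI-style block: a data URL wrapped in image_url.
buildImageContent(imageData, 'openai');
// => { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,/9j/4AAQ…' } }

// Anthropic-style block: base64 payload and media type kept separate.
buildImageContent(imageData, 'anthropic');
// => { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: '/9j/4AAQ…' } }
```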
@@ -361,6 +384,7 @@ export class SpatialReasoningEngine {
 
   private llmSearched: boolean = false;
   private llmProvider: string | null = null;
+  private llmProviderType: LlmProvider = 'unknown';
 
   /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
   private async findLlmDevice(): Promise<ChatCompletionDevice | null> {
@@ -381,27 +405,36 @@ export class SpatialReasoningEngine {
       const deviceName = device.name?.toLowerCase() || '';
       const pluginId = (device as any).pluginId?.toLowerCase() || '';
 
-      // Identify the provider type for logging
+      // Identify the provider type for logging and image format selection
       let providerType = 'Unknown';
-      …
-      …
-      }
+      let providerTypeEnum: LlmProvider = 'unknown';
+
       if (deviceName.includes('openai') || deviceName.includes('gpt')) {
         providerType = 'OpenAI';
+        providerTypeEnum = 'openai';
       } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
         providerType = 'Anthropic';
+        providerTypeEnum = 'anthropic';
       } else if (deviceName.includes('ollama')) {
         providerType = 'Ollama';
+        providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
       } else if (deviceName.includes('gemini') || deviceName.includes('google')) {
         providerType = 'Google';
+        providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
       } else if (deviceName.includes('llama')) {
         providerType = 'llama.cpp';
+        providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+      } else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+        providerType = 'Scrypted LLM';
+        providerTypeEnum = 'unknown';
       }
 
       this.llmDevice = device as unknown as ChatCompletionDevice;
       this.llmProvider = `${providerType} (${device.name})`;
+      this.llmProviderType = providerTypeEnum;
       this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
       this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+      this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
       this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
       return this.llmDevice;
     }
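As a standalone illustration of the same name-matching heuristic (the device names in the calls below are invented, and `detectImageFormat` is a sketch, not the plugin's actual export):

```typescript
type LlmProvider = 'openai' | 'anthropic' | 'unknown';

// Minimal sketch of the name-based detection in the hunk above.
function detectImageFormat(deviceName: string): LlmProvider {
  const name = deviceName.toLowerCase();
  if (name.includes('openai') || name.includes('gpt')) return 'openai';
  if (name.includes('anthropic') || name.includes('claude')) return 'anthropic';
  // Ollama, Google/Gemini, and llama.cpp all accept the OpenAI-compatible shape.
  if (['ollama', 'gemini', 'google', 'llama'].some(s => name.includes(s))) return 'openai';
  return 'unknown';
}

console.log(detectImageFormat('Claude 3.5 Sonnet')); // anthropic
console.log(detectImageFormat('GPT-4o'));            // openai
console.log(detectImageFormat('Some Custom LLM'));   // unknown
```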
@@ -423,6 +456,11 @@ export class SpatialReasoningEngine {
     return this.llmProvider;
   }
 
+  /** Get the current LLM provider type for image format selection */
+  getLlmProviderType(): LlmProvider {
+    return this.llmProviderType;
+  }
+
   /** Check if LLM is available */
   isLlmAvailable(): boolean {
     return this.llmDevice !== null;
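A hypothetical caller can use the new accessor to build provider-appropriate blocks without reaching into engine internals:

```typescript
// engine and imageData stand in for real values here.
const format = engine.getLlmProviderType(); // 'openai' | 'anthropic' | 'unknown'
const imageBlock = buildImageContent(imageData, format);
```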
@@ -801,10 +839,10 @@ export class SpatialReasoningEngine {
     // Build message content - use multimodal format if we have an image
     let messageContent: any;
     if (imageData) {
-      // Vision-capable multimodal message format (
+      // Vision-capable multimodal message format (provider-specific)
       messageContent = [
         { type: 'text', text: prompt },
-        buildImageContent(imageData),
+        buildImageContent(imageData, this.llmProviderType),
       ];
     } else {
       // Fallback to text-only if image conversion failed
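For an OpenAI-style provider, the assembled user message ends up shaped roughly like this (prompt text and base64 payload are invented for illustration):

```typescript
// Hypothetical assembled request content for an 'openai' provider type.
const messages = [
  {
    role: 'user',
    content: [
      { type: 'text', text: 'Which room is this camera most likely facing?' },
      {
        type: 'image_url',
        image_url: { url: 'data:image/jpeg;base64,/9j/4AAQ…' },
      },
    ],
  },
];
```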
@@ -906,10 +944,10 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
     // Build message content - use multimodal format if we have an image
     let messageContent: any;
     if (imageData) {
-      // Vision-capable multimodal message format (
+      // Vision-capable multimodal message format (provider-specific)
       messageContent = [
         { type: 'text', text: prompt },
-        buildImageContent(imageData),
+        buildImageContent(imageData, this.llmProviderType),
       ];
     } else {
       // Fallback to text-only if image conversion failed

package/src/core/topology-discovery.ts
CHANGED
@@ -30,7 +30,7 @@ import {
   Landmark,
   findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64, buildImageContent, ImageData } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
 
 const { systemManager } = sdk;
 
@@ -100,6 +100,7 @@ export class TopologyDiscoveryEngine {
   private topology: CameraTopology | null = null;
   private llmDevice: ChatCompletionDevice | null = null;
   private llmSearched: boolean = false;
+  private llmProviderType: LlmProvider = 'unknown';
 
   // Scene analysis cache (camera ID -> analysis)
   private sceneCache: Map<string, SceneAnalysis> = new Map();
@@ -177,8 +178,24 @@ export class TopologyDiscoveryEngine {
       if (!device) continue;
 
       if (device.interfaces?.includes('ChatCompletion')) {
+        const deviceName = device.name?.toLowerCase() || '';
+
+        // Detect provider type for image format selection
+        if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+          this.llmProviderType = 'openai';
+        } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+          this.llmProviderType = 'anthropic';
+        } else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+                   deviceName.includes('google') || deviceName.includes('llama')) {
+          // These providers use OpenAI-compatible format
+          this.llmProviderType = 'openai';
+        } else {
+          this.llmProviderType = 'unknown';
+        }
+
         this.llmDevice = device as unknown as ChatCompletionDevice;
         this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+        this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
         return this.llmDevice;
       }
     }
@@ -237,14 +254,14 @@ export class TopologyDiscoveryEngine {
     }
 
     try {
-      // Build multimodal message with
+      // Build multimodal message with provider-specific image format
       const result = await llm.getChatCompletion({
         messages: [
           {
             role: 'user',
             content: [
               { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-              buildImageContent(imageData),
+              buildImageContent(imageData, this.llmProviderType),
             ],
           },
         ],
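Taken together, a minimal sketch of the flow these hunks implement, assuming `mediaObjectToBase64` resolves to an `ImageData`-or-null (its full return type is truncated in the hunk header above); `llm` and `mediaObject` stand in for real Scrypted objects:

```typescript
import { mediaObjectToBase64, buildImageContent, LlmProvider } from './spatial-reasoning';

// Sketch only: SCENE_ANALYSIS_PROMPT is the module's own prompt constant.
async function analyzeScene(llm: any, mediaObject: any, provider: LlmProvider) {
  const imageData = await mediaObjectToBase64(mediaObject);
  const content: any[] = [{ type: 'text', text: SCENE_ANALYSIS_PROMPT }];
  if (imageData) {
    // Attach the snapshot in the provider-appropriate format.
    content.push(buildImageContent(imageData, provider));
  }
  return llm.getChatCompletion({ messages: [{ role: 'user', content }] });
}
```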