@blueharford/scrypted-spatial-awareness 0.5.2 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +190 -80
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +74 -20
- package/src/core/topology-discovery.ts +131 -66
package/out/plugin.zip
CHANGED
Binary file
package/package.json
CHANGED
(1 line changed; diff body not shown in this view)

package/src/core/spatial-reasoning.ts
CHANGED
@@ -100,21 +100,60 @@ export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<Ima
     }
 }

+/** LLM Provider type for image format selection */
+export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
+
 /**
  * Build image content block for ChatCompletion API
- *
+ * Supports both OpenAI and Anthropic formats
+ * @param imageData - Image data with base64 and media type
+ * @param provider - The LLM provider type (openai, anthropic, or unknown)
  */
-export function buildImageContent(imageData: ImageData): any {
-    [10 deleted lines not captured in this view]
+export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
+    if (provider === 'openai') {
+        // OpenAI format: uses data URL with image_url wrapper
+        // Include detail parameter for compatibility
+        return {
+            type: 'image_url',
+            image_url: {
+                url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+                detail: 'auto',
+            },
+        };
+    } else if (provider === 'anthropic') {
+        // Anthropic format: uses separate base64 data and media_type
+        return {
+            type: 'image',
+            source: {
+                type: 'base64',
+                media_type: imageData.mediaType,
+                data: imageData.base64,
+            },
+        };
+    } else {
+        // Unknown provider: try Anthropic format first as it's more explicit
+        // Some plugins may translate this to OpenAI format internally
+        return {
+            type: 'image',
+            source: {
+                type: 'base64',
+                media_type: imageData.mediaType,
+                data: imageData.base64,
+            },
+        };
+    }
+}
+
+/** Check if an error indicates vision/multimodal content is not supported */
+export function isVisionNotSupportedError(error: any): boolean {
+    const errorStr = String(error);
+    return (
+        errorStr.includes('content.str') ||
+        errorStr.includes('should be a valid string') ||
+        errorStr.includes('Invalid content type') ||
+        errorStr.includes('does not support vision') ||
+        errorStr.includes('image_url') && errorStr.includes('not supported')
+    );
 }

 export class SpatialReasoningEngine {
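Note: the change above swaps a single hard-coded image block for provider-specific ones. For reference, the two shapes buildImageContent now emits look like this (an illustrative sketch with placeholder values; the ImageData shape is taken from this diff):

    type ImageData = { base64: string; mediaType: string };
    const img: ImageData = { base64: 'AAAA', mediaType: 'image/jpeg' };

    // provider === 'openai' (and, per the detection below, Ollama/Google/llama.cpp):
    const openAiBlock = {
        type: 'image_url',
        image_url: { url: `data:${img.mediaType};base64,${img.base64}`, detail: 'auto' },
    };

    // provider === 'anthropic' (also the fallback when the provider is unknown):
    const anthropicBlock = {
        type: 'image',
        source: { type: 'base64', media_type: img.mediaType, data: img.base64 },
    };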
@@ -361,6 +400,7 @@ export class SpatialReasoningEngine {

     private llmSearched: boolean = false;
     private llmProvider: string | null = null;
+    private llmProviderType: LlmProvider = 'unknown';

     /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
     private async findLlmDevice(): Promise<ChatCompletionDevice | null> {
@@ -381,27 +421,36 @@
             const deviceName = device.name?.toLowerCase() || '';
             const pluginId = (device as any).pluginId?.toLowerCase() || '';

-            // Identify the provider type for logging
+            // Identify the provider type for logging and image format selection
             let providerType = 'Unknown';
-            [2 deleted lines not captured in this view]
-            }
+            let providerTypeEnum: LlmProvider = 'unknown';
+
             if (deviceName.includes('openai') || deviceName.includes('gpt')) {
                 providerType = 'OpenAI';
+                providerTypeEnum = 'openai';
             } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
                 providerType = 'Anthropic';
+                providerTypeEnum = 'anthropic';
             } else if (deviceName.includes('ollama')) {
                 providerType = 'Ollama';
+                providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
             } else if (deviceName.includes('gemini') || deviceName.includes('google')) {
                 providerType = 'Google';
+                providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
             } else if (deviceName.includes('llama')) {
                 providerType = 'llama.cpp';
+                providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+            } else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+                providerType = 'Scrypted LLM';
+                providerTypeEnum = 'unknown';
             }

             this.llmDevice = device as unknown as ChatCompletionDevice;
             this.llmProvider = `${providerType} (${device.name})`;
+            this.llmProviderType = providerTypeEnum;
             this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
             this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+            this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
             this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
             return this.llmDevice;
         }
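Note: provider detection is a substring heuristic on the device name; the plugin id is only consulted for the generic 'Scrypted LLM' case. Distilled into a standalone function, the mapping added here is roughly the following (a sketch, not the package's exact code):

    type LlmProvider = 'openai' | 'anthropic' | 'unknown';

    function detectImageFormat(deviceName: string): LlmProvider {
        const name = deviceName.toLowerCase();
        if (name.includes('openai') || name.includes('gpt')) return 'openai';
        if (name.includes('anthropic') || name.includes('claude')) return 'anthropic';
        // Ollama, Google/Gemini, and llama.cpp speak the OpenAI-compatible format.
        if (['ollama', 'gemini', 'google', 'llama'].some(s => name.includes(s))) return 'openai';
        return 'unknown';
    }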
@@ -423,6 +472,11 @@ export class SpatialReasoningEngine {
         return this.llmProvider;
     }

+    /** Get the current LLM provider type for image format selection */
+    getLlmProviderType(): LlmProvider {
+        return this.llmProviderType;
+    }
+
     /** Check if LLM is available */
     isLlmAvailable(): boolean {
         return this.llmDevice !== null;
@@ -801,10 +855,10 @@ export class SpatialReasoningEngine {
         // Build message content - use multimodal format if we have an image
         let messageContent: any;
         if (imageData) {
-            // Vision-capable multimodal message format (
+            // Vision-capable multimodal message format (provider-specific)
             messageContent = [
                 { type: 'text', text: prompt },
-                buildImageContent(imageData),
+                buildImageContent(imageData, this.llmProviderType),
             ];
         } else {
             // Fallback to text-only if image conversion failed
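Note: with this.llmProviderType threaded through, the whole user message becomes provider-shaped. For an OpenAI-style provider, the content array handed to the ChatCompletion device would look roughly like this (illustrative prompt and image values):

    const messageContent = [
        { type: 'text', text: 'Where is the person relative to the driveway?' },
        {
            type: 'image_url',
            image_url: { url: 'data:image/jpeg;base64,AAAA', detail: 'auto' },
        },
    ];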
@@ -906,10 +960,10 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
         // Build message content - use multimodal format if we have an image
         let messageContent: any;
         if (imageData) {
-            // Vision-capable multimodal message format (
+            // Vision-capable multimodal message format (provider-specific)
             messageContent = [
                 { type: 'text', text: prompt },
-                buildImageContent(imageData),
+                buildImageContent(imageData, this.llmProviderType),
             ];
         } else {
             // Fallback to text-only if image conversion failed
package/src/core/topology-discovery.ts
CHANGED

@@ -30,7 +30,7 @@ import {
     Landmark,
     findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64, buildImageContent, ImageData } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider, isVisionNotSupportedError } from './spatial-reasoning';

 const { systemManager } = sdk;

@@ -100,6 +100,7 @@ export class TopologyDiscoveryEngine {
     private topology: CameraTopology | null = null;
     private llmDevice: ChatCompletionDevice | null = null;
     private llmSearched: boolean = false;
+    private llmProviderType: LlmProvider = 'unknown';

     // Scene analysis cache (camera ID -> analysis)
     private sceneCache: Map<string, SceneAnalysis> = new Map();
@@ -177,8 +178,24 @@
             if (!device) continue;

             if (device.interfaces?.includes('ChatCompletion')) {
+                const deviceName = device.name?.toLowerCase() || '';
+
+                // Detect provider type for image format selection
+                if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+                    this.llmProviderType = 'openai';
+                } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+                    this.llmProviderType = 'anthropic';
+                } else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+                           deviceName.includes('google') || deviceName.includes('llama')) {
+                    // These providers use OpenAI-compatible format
+                    this.llmProviderType = 'openai';
+                } else {
+                    this.llmProviderType = 'unknown';
+                }
+
                 this.llmDevice = device as unknown as ChatCompletionDevice;
                 this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+                this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
                 return this.llmDevice;
             }
         }
@@ -236,77 +253,125 @@
             return analysis;
         }

-        [12 deleted lines not captured in this view]
-            max_tokens: 500,
-            temperature: 0.3,
-        });
-
-        const content = result?.choices?.[0]?.message?.content;
-        if (content && typeof content === 'string') {
-            try {
-                // Extract JSON from response (handle markdown code blocks)
-                let jsonStr = content.trim();
-                if (jsonStr.startsWith('```')) {
-                    jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
-                }
-
-                const parsed = JSON.parse(jsonStr);
-
-                // Map parsed data to our types
-                if (Array.isArray(parsed.landmarks)) {
-                    analysis.landmarks = parsed.landmarks.map((l: any) => ({
-                        name: l.name || 'Unknown',
-                        type: this.mapLandmarkType(l.type),
-                        confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
-                        description: l.description || '',
-                        boundingBox: l.boundingBox,
-                    }));
-                }
+        // Try with detected provider format first, then fallback to alternate format
+        const formatsToTry: LlmProvider[] = [this.llmProviderType];
+
+        // Add fallback format
+        if (this.llmProviderType === 'openai') {
+            formatsToTry.push('anthropic');
+        } else if (this.llmProviderType === 'anthropic') {
+            formatsToTry.push('openai');
+        } else {
+            // Unknown - try both
+            formatsToTry.push('openai');
+        }

-        [8 deleted lines not captured in this view]
+        let lastError: any = null;
+
+        for (const formatType of formatsToTry) {
+            try {
+                this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+
+                // Build multimodal message with provider-specific image format
+                const result = await llm.getChatCompletion({
+                    messages: [
+                        {
+                            role: 'user',
+                            content: [
+                                { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+                                buildImageContent(imageData, formatType),
+                            ],
+                        },
+                    ],
+                    max_tokens: 500,
+                    temperature: 0.3,
+                });
+
+                const content = result?.choices?.[0]?.message?.content;
+                if (content && typeof content === 'string') {
+                    try {
+                        // Extract JSON from response (handle markdown code blocks)
+                        let jsonStr = content.trim();
+                        if (jsonStr.startsWith('```')) {
+                            jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+                        }
+
+                        const parsed = JSON.parse(jsonStr);
+
+                        // Map parsed data to our types
+                        if (Array.isArray(parsed.landmarks)) {
+                            analysis.landmarks = parsed.landmarks.map((l: any) => ({
+                                name: l.name || 'Unknown',
+                                type: this.mapLandmarkType(l.type),
+                                confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+                                description: l.description || '',
+                                boundingBox: l.boundingBox,
+                            }));
+                        }
+
+                        if (Array.isArray(parsed.zones)) {
+                            analysis.zones = parsed.zones.map((z: any) => ({
+                                name: z.name || 'Unknown',
+                                type: this.mapZoneType(z.type),
+                                coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+                                description: z.description || '',
+                                boundingBox: z.boundingBox,
+                            }));
+                        }
+
+                        if (parsed.edges && typeof parsed.edges === 'object') {
+                            analysis.edges = {
+                                top: parsed.edges.top || '',
+                                left: parsed.edges.left || '',
+                                right: parsed.edges.right || '',
+                                bottom: parsed.edges.bottom || '',
+                            };
+                        }
+
+                        if (parsed.orientation) {
+                            analysis.orientation = this.mapOrientation(parsed.orientation);
+                        }
+
+                        analysis.isValid = true;
+                        this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+
+                        // Update the preferred format for future requests
+                        if (formatType !== this.llmProviderType) {
+                            this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+                            this.llmProviderType = formatType;
+                        }
+
+                        // Success - exit the retry loop
+                        return analysis;
+                    } catch (parseError) {
+                        this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+                        analysis.error = 'Failed to parse LLM response';
+                        return analysis;
                     }
+                }
+            } catch (e) {
+                lastError = e;

-        [5 deleted lines not captured in this view]
-                        bottom: parsed.edges.bottom || '',
-                    };
-                }
+                // Check if this is a vision/multimodal format error
+                if (isVisionNotSupportedError(e)) {
+                    this.console.warn(`[Discovery] ${formatType} format not supported, trying fallback...`);
+                    continue; // Try next format
+                }

-        [3 deleted lines not captured in this view]
+                // Not a format error - don't retry
+                this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+                break;
+            }
+        }

-        [6 deleted lines not captured in this view]
+        // All formats failed
+        if (lastError) {
+            const errorStr = String(lastError);
+            if (isVisionNotSupportedError(lastError)) {
+                analysis.error = 'Vision/image analysis not supported by configured LLM. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured.';
+            } else {
+                analysis.error = `Analysis failed: ${errorStr}`;
             }
-        } catch (e) {
-            this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
-            analysis.error = `Analysis failed: ${e}`;
         }

         // Cache the analysis
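Note: the rewrite above turns a single-shot request into a format-fallback loop: try the detected format, retry with the alternate format only when the failure looks like a vision/format error, and remember whichever format succeeded. Stripped of the Scrypted specifics, the pattern is the following self-contained sketch, where sendWithFormat and looksLikeFormatError stand in for the real calls:

    type LlmProvider = 'openai' | 'anthropic' | 'unknown';

    async function callWithFormatFallback<T>(
        preferred: LlmProvider,
        sendWithFormat: (format: LlmProvider) => Promise<T>,
        looksLikeFormatError: (e: unknown) => boolean,
    ): Promise<T> {
        // Mirror the diff's ordering: openai falls back to anthropic,
        // anthropic and unknown fall back to openai.
        const formats: LlmProvider[] =
            preferred === 'openai' ? ['openai', 'anthropic'] : [preferred, 'openai'];

        let lastError: unknown;
        for (const format of formats) {
            try {
                return await sendWithFormat(format);
            } catch (e) {
                lastError = e;
                if (!looksLikeFormatError(e))
                    break; // a real failure - retrying with another format won't help
            }
        }
        throw lastError;
    }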