@blueharford/scrypted-spatial-awareness 0.5.3 → 0.5.4
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +121 -65
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +21 -5
- package/src/core/topology-discovery.ts +114 -66
package/out/plugin.zip
CHANGED
Binary file
package/package.json
CHANGED

package/src/core/spatial-reasoning.ts
CHANGED
@@ -112,10 +112,12 @@ export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
 export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
   if (provider === 'openai') {
     // OpenAI format: uses data URL with image_url wrapper
+    // Include detail parameter for compatibility
     return {
       type: 'image_url',
       image_url: {
         url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+        detail: 'auto',
       },
     };
   } else if (provider === 'anthropic') {
@@ -129,17 +131,31 @@ export function buildImageContent(imageData: ImageData, provider: LlmProvider =
       },
     };
   } else {
-    // Unknown provider: try
-    //
+    // Unknown provider: try Anthropic format first as it's more explicit
+    // Some plugins may translate this to OpenAI format internally
     return {
-      type: '
-
-
+      type: 'image',
+      source: {
+        type: 'base64',
+        media_type: imageData.mediaType,
+        data: imageData.base64,
       },
     };
   }
 }
 
+/** Check if an error indicates vision/multimodal content is not supported */
+export function isVisionNotSupportedError(error: any): boolean {
+  const errorStr = String(error);
+  return (
+    errorStr.includes('content.str') ||
+    errorStr.includes('should be a valid string') ||
+    errorStr.includes('Invalid content type') ||
+    errorStr.includes('does not support vision') ||
+    errorStr.includes('image_url') && errorStr.includes('not supported')
+  );
+}
+
 export class SpatialReasoningEngine {
   private config: SpatialReasoningConfig;
   private console: Console;
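For reference, here is a minimal sketch of the two payload shapes buildImageContent now emits; the imageData literal below is a hypothetical example value, not taken from the package:

// Hypothetical input, for illustration only.
const imageData: ImageData = { mediaType: 'image/jpeg', base64: '<...>' };

// 'openai' → OpenAI Chat Completions image part (data URL plus detail hint):
// { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,<...>', detail: 'auto' } }
const openaiPart = buildImageContent(imageData, 'openai');

// 'anthropic' → Anthropic image block; as of this release the 'unknown'
// fallback produces this same shape:
// { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: '<...>' } }
const anthropicPart = buildImageContent(imageData, 'anthropic');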
package/src/core/topology-discovery.ts
CHANGED

@@ -30,7 +30,7 @@ import {
   Landmark,
   findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider, isVisionNotSupportedError } from './spatial-reasoning';
 
 const { systemManager } = sdk;
 
@@ -253,77 +253,125 @@ export class TopologyDiscoveryEngine {
     return analysis;
   }
 
-
-
-
-
-
-
-
-
-
-
-
-
-        max_tokens: 500,
-        temperature: 0.3,
-      });
-
-      const content = result?.choices?.[0]?.message?.content;
-      if (content && typeof content === 'string') {
-        try {
-          // Extract JSON from response (handle markdown code blocks)
-          let jsonStr = content.trim();
-          if (jsonStr.startsWith('```')) {
-            jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
-          }
-
-          const parsed = JSON.parse(jsonStr);
-
-          // Map parsed data to our types
-          if (Array.isArray(parsed.landmarks)) {
-            analysis.landmarks = parsed.landmarks.map((l: any) => ({
-              name: l.name || 'Unknown',
-              type: this.mapLandmarkType(l.type),
-              confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
-              description: l.description || '',
-              boundingBox: l.boundingBox,
-            }));
-          }
+    // Try with detected provider format first, then fallback to alternate format
+    const formatsToTry: LlmProvider[] = [this.llmProviderType];
+
+    // Add fallback format
+    if (this.llmProviderType === 'openai') {
+      formatsToTry.push('anthropic');
+    } else if (this.llmProviderType === 'anthropic') {
+      formatsToTry.push('openai');
+    } else {
+      // Unknown - try both
+      formatsToTry.push('openai');
+    }
 
-
-
-
-
-
-
-
-
+    let lastError: any = null;
+
+    for (const formatType of formatsToTry) {
+      try {
+        this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+
+        // Build multimodal message with provider-specific image format
+        const result = await llm.getChatCompletion({
+          messages: [
+            {
+              role: 'user',
+              content: [
+                { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+                buildImageContent(imageData, formatType),
+              ],
+            },
+          ],
+          max_tokens: 500,
+          temperature: 0.3,
+        });
+
+        const content = result?.choices?.[0]?.message?.content;
+        if (content && typeof content === 'string') {
+          try {
+            // Extract JSON from response (handle markdown code blocks)
+            let jsonStr = content.trim();
+            if (jsonStr.startsWith('```')) {
+              jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+            }
+
+            const parsed = JSON.parse(jsonStr);
+
+            // Map parsed data to our types
+            if (Array.isArray(parsed.landmarks)) {
+              analysis.landmarks = parsed.landmarks.map((l: any) => ({
+                name: l.name || 'Unknown',
+                type: this.mapLandmarkType(l.type),
+                confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+                description: l.description || '',
+                boundingBox: l.boundingBox,
+              }));
+            }
+
+            if (Array.isArray(parsed.zones)) {
+              analysis.zones = parsed.zones.map((z: any) => ({
+                name: z.name || 'Unknown',
+                type: this.mapZoneType(z.type),
+                coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+                description: z.description || '',
+                boundingBox: z.boundingBox,
+              }));
+            }
+
+            if (parsed.edges && typeof parsed.edges === 'object') {
+              analysis.edges = {
+                top: parsed.edges.top || '',
+                left: parsed.edges.left || '',
+                right: parsed.edges.right || '',
+                bottom: parsed.edges.bottom || '',
+              };
+            }
+
+            if (parsed.orientation) {
+              analysis.orientation = this.mapOrientation(parsed.orientation);
+            }
+
+            analysis.isValid = true;
+            this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+
+            // Update the preferred format for future requests
+            if (formatType !== this.llmProviderType) {
+              this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+              this.llmProviderType = formatType;
+            }
+
+            // Success - exit the retry loop
+            return analysis;
+          } catch (parseError) {
+            this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+            analysis.error = 'Failed to parse LLM response';
+            return analysis;
           }
+        }
+      } catch (e) {
+        lastError = e;
 
-
-
-
-
-
-            bottom: parsed.edges.bottom || '',
-          };
-        }
+        // Check if this is a vision/multimodal format error
+        if (isVisionNotSupportedError(e)) {
+          this.console.warn(`[Discovery] ${formatType} format not supported, trying fallback...`);
+          continue; // Try next format
+        }
 
-
-
-
+        // Not a format error - don't retry
+        this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+        break;
+      }
+    }
 
-
-
-
-
-
-
+    // All formats failed
+    if (lastError) {
+      const errorStr = String(lastError);
+      if (isVisionNotSupportedError(lastError)) {
+        analysis.error = 'Vision/image analysis not supported by configured LLM. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured.';
+      } else {
+        analysis.error = `Analysis failed: ${errorStr}`;
       }
-    } catch (e) {
-      this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
-      analysis.error = `Analysis failed: ${e}`;
     }
 
     // Cache the analysis