@blueharford/scrypted-spatial-awareness 0.5.3 → 0.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +154 -68
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +51 -9
- package/src/core/topology-discovery.ts +120 -66
package/out/plugin.zip
CHANGED
|
Binary file
|
package/package.json
CHANGED
|
package/src/core/spatial-reasoning.ts
CHANGED
|
@@ -87,12 +87,25 @@ export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<Ima
|
|
|
87
87
|
// Convert MediaObject to Buffer using mediaManager
|
|
88
88
|
const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
|
|
89
89
|
|
|
90
|
+
if (!buffer || buffer.length === 0) {
|
|
91
|
+
console.warn('Failed to convert MediaObject: empty buffer');
|
|
92
|
+
return null;
|
|
93
|
+
}
|
|
94
|
+
|
|
90
95
|
// Convert buffer to base64 (raw, no data URL prefix)
|
|
91
96
|
const base64 = buffer.toString('base64');
|
|
92
97
|
|
|
98
|
+
// Validate base64 - check it's not empty and looks valid
|
|
99
|
+
if (!base64 || base64.length < 100) {
|
|
100
|
+
console.warn(`Invalid base64: length=${base64?.length || 0}`);
|
|
101
|
+
return null;
|
|
102
|
+
}
|
|
103
|
+
|
|
93
104
|
// Determine MIME type - default to JPEG for camera images
|
|
94
105
|
const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
|
|
95
106
|
|
|
107
|
+
console.log(`[Image] Converted to base64: ${base64.length} chars, type=${mediaType}`);
|
|
108
|
+
|
|
96
109
|
return { base64, mediaType };
|
|
97
110
|
} catch (e) {
|
|
98
111
|
console.warn('Failed to convert MediaObject to base64:', e);
|
|
@@ -101,13 +114,13 @@ export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<Ima
|
|
|
101
114
|
}
|
|
102
115
|
|
|
103
116
|
/** LLM Provider type for image format selection */
|
|
104
|
-
export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
|
|
117
|
+
export type LlmProvider = 'openai' | 'anthropic' | 'scrypted' | 'unknown';
|
|
105
118
|
|
|
106
119
|
/**
|
|
107
120
|
* Build image content block for ChatCompletion API
|
|
108
|
-
* Supports
|
|
121
|
+
* Supports OpenAI, Anthropic, and @scrypted/llm formats
|
|
109
122
|
* @param imageData - Image data with base64 and media type
|
|
110
|
-
* @param provider - The LLM provider type
|
|
123
|
+
* @param provider - The LLM provider type
|
|
111
124
|
*/
|
|
112
125
|
export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
|
|
113
126
|
if (provider === 'openai') {
|
|
@@ -116,10 +129,11 @@ export function buildImageContent(imageData: ImageData, provider: LlmProvider =
|
|
|
116
129
|
type: 'image_url',
|
|
117
130
|
image_url: {
|
|
118
131
|
url: `data:${imageData.mediaType};base64,${imageData.base64}`,
|
|
132
|
+
detail: 'auto',
|
|
119
133
|
},
|
|
120
134
|
};
|
|
121
135
|
} else if (provider === 'anthropic') {
|
|
122
|
-
// Anthropic format: uses
|
|
136
|
+
// Anthropic official format: uses 'data' key
|
|
123
137
|
return {
|
|
124
138
|
type: 'image',
|
|
125
139
|
source: {
|
|
@@ -128,18 +142,46 @@ export function buildImageContent(imageData: ImageData, provider: LlmProvider =
|
|
|
128
142
|
data: imageData.base64,
|
|
129
143
|
},
|
|
130
144
|
};
|
|
145
|
+
} else if (provider === 'scrypted') {
|
|
146
|
+
// @scrypted/llm format: uses 'base64' key (per error path .image.source.base64)
|
|
147
|
+
return {
|
|
148
|
+
type: 'image',
|
|
149
|
+
source: {
|
|
150
|
+
type: 'base64',
|
|
151
|
+
media_type: imageData.mediaType,
|
|
152
|
+
base64: imageData.base64,
|
|
153
|
+
},
|
|
154
|
+
};
|
|
131
155
|
} else {
|
|
132
|
-
// Unknown provider: try
|
|
133
|
-
// Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
|
|
156
|
+
// Unknown provider: try @scrypted/llm format first
|
|
134
157
|
return {
|
|
135
|
-
type: '
|
|
136
|
-
|
|
137
|
-
|
|
158
|
+
type: 'image',
|
|
159
|
+
source: {
|
|
160
|
+
type: 'base64',
|
|
161
|
+
media_type: imageData.mediaType,
|
|
162
|
+
base64: imageData.base64,
|
|
138
163
|
},
|
|
139
164
|
};
|
|
140
165
|
}
|
|
141
166
|
}
|
|
142
167
|
|
|
168
|
+
/**
 * Substrings that, on their own, mark an error as a vision/multimodal
 * payload-format problem (as opposed to a network, auth, or quota failure).
 */
const VISION_FORMAT_ERROR_MARKERS: readonly string[] = [
  'content.str',
  'should be a valid string',
  'Invalid content type',
  'does not support vision',
  'invalid base64',
  'Invalid base64',
  '.image.source',
  '.image_url',
];

/**
 * Flatten an arbitrary thrown value into text that can be searched for markers.
 *
 * `String(value)` is always included, so strings and `Error` instances produce
 * exactly the text they always did. For other objects `String()` yields the
 * useless "[object Object]", so the object's `message` (when it is a string)
 * and its JSON form are appended as well.
 */
function visionErrorToSearchText(error: unknown): string {
  const parts: string[] = [String(error)];

  if (typeof error === 'object' && error !== null && !(error instanceof Error)) {
    const message = (error as { message?: unknown }).message;
    if (typeof message === 'string') {
      parts.push(message);
    }
    try {
      const json = JSON.stringify(error);
      if (typeof json === 'string') {
        parts.push(json);
      }
    } catch {
      // Circular or otherwise unserializable object: fall back to what we already have.
    }
  }

  // Markers never contain a newline, so joining cannot create a match across parts.
  return parts.join('\n');
}

/**
 * Check if an error indicates a vision/multimodal content format issue,
 * meaning the caller should retry with an alternate image format.
 *
 * @param error - Any thrown value: a string, an `Error`, or a plain object
 *   such as a JSON error body.
 * @returns `true` when the error text contains one of the known format-error
 *   markers, mentions `image_url` together with "not supported", or mentions
 *   both "400" and "content"; `false` otherwise (including `null`/`undefined`).
 */
export function isVisionFormatError(error: unknown): boolean {
  const errorStr = visionErrorToSearchText(error);

  if (VISION_FORMAT_ERROR_MARKERS.some((marker) => errorStr.includes(marker))) {
    return true;
  }

  // Some providers reject OpenAI-style image parts with a generic "not supported" message.
  if (errorStr.includes('image_url') && errorStr.includes('not supported')) {
    return true;
  }

  // Deliberately broad heuristic kept from the original: an HTTP 400 that
  // mentions "content" is treated as a payload-format rejection.
  return errorStr.includes('400') && errorStr.includes('content');
}
|
|
184
|
+
|
|
143
185
|
export class SpatialReasoningEngine {
|
|
144
186
|
private config: SpatialReasoningConfig;
|
|
145
187
|
private console: Console;
|
|
package/src/core/topology-discovery.ts
CHANGED
|
@@ -30,7 +30,7 @@ import {
|
|
|
30
30
|
Landmark,
|
|
31
31
|
findCamera,
|
|
32
32
|
} from '../models/topology';
|
|
33
|
-
import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
|
|
33
|
+
import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider, isVisionFormatError } from './spatial-reasoning';
|
|
34
34
|
|
|
35
35
|
const { systemManager } = sdk;
|
|
36
36
|
|
|
@@ -253,77 +253,131 @@ export class TopologyDiscoveryEngine {
|
|
|
253
253
|
return analysis;
|
|
254
254
|
}
|
|
255
255
|
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
try {
|
|
275
|
-
// Extract JSON from response (handle markdown code blocks)
|
|
276
|
-
let jsonStr = content.trim();
|
|
277
|
-
if (jsonStr.startsWith('```')) {
|
|
278
|
-
jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
|
|
279
|
-
}
|
|
280
|
-
|
|
281
|
-
const parsed = JSON.parse(jsonStr);
|
|
282
|
-
|
|
283
|
-
// Map parsed data to our types
|
|
284
|
-
if (Array.isArray(parsed.landmarks)) {
|
|
285
|
-
analysis.landmarks = parsed.landmarks.map((l: any) => ({
|
|
286
|
-
name: l.name || 'Unknown',
|
|
287
|
-
type: this.mapLandmarkType(l.type),
|
|
288
|
-
confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
|
|
289
|
-
description: l.description || '',
|
|
290
|
-
boundingBox: l.boundingBox,
|
|
291
|
-
}));
|
|
292
|
-
}
|
|
256
|
+
// Try with detected provider format first, then fallback to alternates
|
|
257
|
+
// The order matters: try the most likely formats first
|
|
258
|
+
const formatsToTry: LlmProvider[] = [];
|
|
259
|
+
|
|
260
|
+
// Start with detected format
|
|
261
|
+
formatsToTry.push(this.llmProviderType);
|
|
262
|
+
|
|
263
|
+
// Add fallbacks based on detected provider
|
|
264
|
+
if (this.llmProviderType === 'openai') {
|
|
265
|
+
formatsToTry.push('scrypted', 'anthropic');
|
|
266
|
+
} else if (this.llmProviderType === 'anthropic') {
|
|
267
|
+
formatsToTry.push('scrypted', 'openai');
|
|
268
|
+
} else if (this.llmProviderType === 'scrypted') {
|
|
269
|
+
formatsToTry.push('anthropic', 'openai');
|
|
270
|
+
} else {
|
|
271
|
+
// Unknown - try all formats
|
|
272
|
+
formatsToTry.push('scrypted', 'anthropic', 'openai');
|
|
273
|
+
}
|
|
293
274
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
275
|
+
let lastError: any = null;
|
|
276
|
+
|
|
277
|
+
for (const formatType of formatsToTry) {
|
|
278
|
+
try {
|
|
279
|
+
this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
|
|
280
|
+
|
|
281
|
+
// Build multimodal message with provider-specific image format
|
|
282
|
+
const result = await llm.getChatCompletion({
|
|
283
|
+
messages: [
|
|
284
|
+
{
|
|
285
|
+
role: 'user',
|
|
286
|
+
content: [
|
|
287
|
+
{ type: 'text', text: SCENE_ANALYSIS_PROMPT },
|
|
288
|
+
buildImageContent(imageData, formatType),
|
|
289
|
+
],
|
|
290
|
+
},
|
|
291
|
+
],
|
|
292
|
+
max_tokens: 500,
|
|
293
|
+
temperature: 0.3,
|
|
294
|
+
});
|
|
295
|
+
|
|
296
|
+
const content = result?.choices?.[0]?.message?.content;
|
|
297
|
+
if (content && typeof content === 'string') {
|
|
298
|
+
try {
|
|
299
|
+
// Extract JSON from response (handle markdown code blocks)
|
|
300
|
+
let jsonStr = content.trim();
|
|
301
|
+
if (jsonStr.startsWith('```')) {
|
|
302
|
+
jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
const parsed = JSON.parse(jsonStr);
|
|
306
|
+
|
|
307
|
+
// Map parsed data to our types
|
|
308
|
+
if (Array.isArray(parsed.landmarks)) {
|
|
309
|
+
analysis.landmarks = parsed.landmarks.map((l: any) => ({
|
|
310
|
+
name: l.name || 'Unknown',
|
|
311
|
+
type: this.mapLandmarkType(l.type),
|
|
312
|
+
confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
|
|
313
|
+
description: l.description || '',
|
|
314
|
+
boundingBox: l.boundingBox,
|
|
315
|
+
}));
|
|
316
|
+
}
|
|
317
|
+
|
|
318
|
+
if (Array.isArray(parsed.zones)) {
|
|
319
|
+
analysis.zones = parsed.zones.map((z: any) => ({
|
|
320
|
+
name: z.name || 'Unknown',
|
|
321
|
+
type: this.mapZoneType(z.type),
|
|
322
|
+
coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
|
|
323
|
+
description: z.description || '',
|
|
324
|
+
boundingBox: z.boundingBox,
|
|
325
|
+
}));
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
if (parsed.edges && typeof parsed.edges === 'object') {
|
|
329
|
+
analysis.edges = {
|
|
330
|
+
top: parsed.edges.top || '',
|
|
331
|
+
left: parsed.edges.left || '',
|
|
332
|
+
right: parsed.edges.right || '',
|
|
333
|
+
bottom: parsed.edges.bottom || '',
|
|
334
|
+
};
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
if (parsed.orientation) {
|
|
338
|
+
analysis.orientation = this.mapOrientation(parsed.orientation);
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
analysis.isValid = true;
|
|
342
|
+
this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
|
|
343
|
+
|
|
344
|
+
// Update the preferred format for future requests
|
|
345
|
+
if (formatType !== this.llmProviderType) {
|
|
346
|
+
this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
|
|
347
|
+
this.llmProviderType = formatType;
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
// Success - exit the retry loop
|
|
351
|
+
return analysis;
|
|
352
|
+
} catch (parseError) {
|
|
353
|
+
this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
|
|
354
|
+
analysis.error = 'Failed to parse LLM response';
|
|
355
|
+
return analysis;
|
|
302
356
|
}
|
|
357
|
+
}
|
|
358
|
+
} catch (e) {
|
|
359
|
+
lastError = e;
|
|
303
360
|
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
bottom: parsed.edges.bottom || '',
|
|
310
|
-
};
|
|
311
|
-
}
|
|
361
|
+
// Check if this is a vision/multimodal format error
|
|
362
|
+
if (isVisionFormatError(e)) {
|
|
363
|
+
this.console.warn(`[Discovery] ${formatType} format failed, trying fallback...`);
|
|
364
|
+
continue; // Try next format
|
|
365
|
+
}
|
|
312
366
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
367
|
+
// Not a format error - don't retry
|
|
368
|
+
this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
|
|
369
|
+
break;
|
|
370
|
+
}
|
|
371
|
+
}
|
|
316
372
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
373
|
+
// All formats failed
|
|
374
|
+
if (lastError) {
|
|
375
|
+
const errorStr = String(lastError);
|
|
376
|
+
if (isVisionFormatError(lastError)) {
|
|
377
|
+
analysis.error = 'Vision/image analysis failed with all formats. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured and the @scrypted/llm plugin supports vision.';
|
|
378
|
+
} else {
|
|
379
|
+
analysis.error = `Analysis failed: ${errorStr}`;
|
|
323
380
|
}
|
|
324
|
-
} catch (e) {
|
|
325
|
-
this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
|
|
326
|
-
analysis.error = `Analysis failed: ${e}`;
|
|
327
381
|
}
|
|
328
382
|
|
|
329
383
|
// Cache the analysis
|