@blueharford/scrypted-spatial-awareness 0.5.3 → 0.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +154 -68
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +51 -9
- package/src/core/topology-discovery.ts +120 -66
package/dist/plugin.zip
CHANGED
Binary file

package/out/main.nodejs.js
CHANGED
@@ -35095,6 +35095,7 @@ Object.defineProperty(exports, "__esModule", ({ value: true }));
 exports.SpatialReasoningEngine = void 0;
 exports.mediaObjectToBase64 = mediaObjectToBase64;
 exports.buildImageContent = buildImageContent;
+exports.isVisionFormatError = isVisionFormatError;
 const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
 const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
 const { systemManager, mediaManager } = sdk_1.default;
@@ -35107,10 +35108,20 @@ async function mediaObjectToBase64(mediaObject) {
     try {
         // Convert MediaObject to Buffer using mediaManager
         const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, sdk_1.ScryptedMimeTypes.Image);
+        if (!buffer || buffer.length === 0) {
+            console.warn('Failed to convert MediaObject: empty buffer');
+            return null;
+        }
         // Convert buffer to base64 (raw, no data URL prefix)
         const base64 = buffer.toString('base64');
+        // Validate base64 - check it's not empty and looks valid
+        if (!base64 || base64.length < 100) {
+            console.warn(`Invalid base64: length=${base64?.length || 0}`);
+            return null;
+        }
         // Determine MIME type - default to JPEG for camera images
         const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+        console.log(`[Image] Converted to base64: ${base64.length} chars, type=${mediaType}`);
         return { base64, mediaType };
     }
     catch (e) {
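
The guards added above make mediaObjectToBase64 return null instead of handing an empty or truncated payload to the LLM. A minimal caller sketch (TypeScript; snapshotToImage and its error handling are hypothetical, only mediaObjectToBase64 comes from the package):

import type { MediaObject } from '@scrypted/sdk';

// `mediaObjectToBase64` is the function hardened in the hunk above; redeclared
// here so the sketch stands alone.
declare function mediaObjectToBase64(mo: MediaObject): Promise<{ base64: string; mediaType: string } | null>;

// Hypothetical caller: treat null as "no usable image" instead of sending
// an empty or tiny payload to the LLM.
async function snapshotToImage(snapshot: MediaObject) {
    const imageData = await mediaObjectToBase64(snapshot);
    if (!imageData)
        // Empty buffer, or base64 shorter than the 100-char plausibility floor.
        throw new Error('no usable snapshot');
    // imageData.base64 is raw base64 (no data: URL prefix);
    // imageData.mediaType falls back to 'image/jpeg'.
    return imageData;
}
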
@@ -35120,9 +35131,9 @@ async function mediaObjectToBase64(mediaObject) {
 }
 /**
  * Build image content block for ChatCompletion API
- * Supports
+ * Supports OpenAI, Anthropic, and @scrypted/llm formats
  * @param imageData - Image data with base64 and media type
- * @param provider - The LLM provider type
+ * @param provider - The LLM provider type
  */
 function buildImageContent(imageData, provider = 'unknown') {
     if (provider === 'openai') {
@@ -35131,11 +35142,12 @@ function buildImageContent(imageData, provider = 'unknown') {
             type: 'image_url',
             image_url: {
                 url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+                detail: 'auto',
             },
         };
     }
     else if (provider === 'anthropic') {
-        // Anthropic format: uses
+        // Anthropic official format: uses 'data' key
         return {
             type: 'image',
             source: {
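
For reference, these are the two content-block shapes buildImageContent emits in the branches above, written out as literals. This is a sketch; the base64 and mediaType constants stand in for the imageData fields.

const base64 = '/9j/4AAQSkZJRg...'; // stand-in for imageData.base64
const mediaType = 'image/jpeg';     // stand-in for imageData.mediaType

// OpenAI chat-completions image part; `detail: 'auto'` is the field added in 0.5.5.
const openaiPart = {
    type: 'image_url',
    image_url: {
        url: `data:${mediaType};base64,${base64}`,
        detail: 'auto',
    },
};

// Anthropic messages image part; the official format keys the payload as 'data'.
const anthropicPart = {
    type: 'image',
    source: {
        type: 'base64',
        media_type: mediaType,
        data: base64,
    },
};
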
@@ -35145,17 +35157,43 @@ function buildImageContent(imageData, provider = 'unknown') {
             },
         };
     }
+    else if (provider === 'scrypted') {
+        // @scrypted/llm format: uses 'base64' key (per error path .image.source.base64)
+        return {
+            type: 'image',
+            source: {
+                type: 'base64',
+                media_type: imageData.mediaType,
+                base64: imageData.base64,
+            },
+        };
+    }
     else {
-        // Unknown provider: try
-        // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+        // Unknown provider: try @scrypted/llm format first
         return {
-            type: '
-
-
+            type: 'image',
+            source: {
+                type: 'base64',
+                media_type: imageData.mediaType,
+                base64: imageData.base64,
             },
         };
     }
 }
+/** Check if an error indicates vision/multimodal content format issue (should try alternate format) */
+function isVisionFormatError(error) {
+    const errorStr = String(error);
+    return (errorStr.includes('content.str') ||
+        errorStr.includes('should be a valid string') ||
+        errorStr.includes('Invalid content type') ||
+        errorStr.includes('does not support vision') ||
+        errorStr.includes('invalid base64') ||
+        errorStr.includes('Invalid base64') ||
+        errorStr.includes('.image.source') ||
+        errorStr.includes('.image_url') ||
+        (errorStr.includes('image_url') && errorStr.includes('not supported')) ||
+        (errorStr.includes('400') && errorStr.includes('content')));
+}
 class SpatialReasoningEngine {
     config;
     console;
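
isVisionFormatError is deliberately coarse: it substring-matches the stringified error. Two invented error strings illustrate the intended split between retryable format errors and everything else:

// Redeclared so the sketch stands alone; the real definition is in the hunk above.
declare function isVisionFormatError(e: unknown): boolean;

// Hypothetical error strings, invented for illustration only.
isVisionFormatError(new Error('400 Bad Request: messages.0.content.1.image.source is invalid'));
// -> true: '.image.source' matches, so the caller retries with another image format.

isVisionFormatError(new Error('401 Unauthorized: invalid API key'));
// -> false: authentication problems are not format problems; no retry is attempted.
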
@@ -36186,72 +36224,120 @@ class TopologyDiscoveryEngine {
             analysis.error = 'Failed to capture camera snapshot';
             return analysis;
         }
-        … (32 lines of the old implementation, truncated in this diff view)
-        }
-        … (17 lines of the old implementation, truncated in this diff view)
+        // Try with detected provider format first, then fallback to alternates
+        // The order matters: try the most likely formats first
+        const formatsToTry = [];
+        // Start with detected format
+        formatsToTry.push(this.llmProviderType);
+        // Add fallbacks based on detected provider
+        if (this.llmProviderType === 'openai') {
+            formatsToTry.push('scrypted', 'anthropic');
+        }
+        else if (this.llmProviderType === 'anthropic') {
+            formatsToTry.push('scrypted', 'openai');
+        }
+        else if (this.llmProviderType === 'scrypted') {
+            formatsToTry.push('anthropic', 'openai');
+        }
+        else {
+            // Unknown - try all formats
+            formatsToTry.push('scrypted', 'anthropic', 'openai');
+        }
+        let lastError = null;
+        for (const formatType of formatsToTry) {
+            try {
+                this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+                // Build multimodal message with provider-specific image format
+                const result = await llm.getChatCompletion({
+                    messages: [
+                        {
+                            role: 'user',
+                            content: [
+                                { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+                                (0, spatial_reasoning_1.buildImageContent)(imageData, formatType),
+                            ],
+                        },
+                    ],
+                    max_tokens: 500,
+                    temperature: 0.3,
+                });
+                const content = result?.choices?.[0]?.message?.content;
+                if (content && typeof content === 'string') {
+                    try {
+                        // Extract JSON from response (handle markdown code blocks)
+                        let jsonStr = content.trim();
+                        if (jsonStr.startsWith('```')) {
+                            jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+                        }
+                        const parsed = JSON.parse(jsonStr);
+                        // Map parsed data to our types
+                        if (Array.isArray(parsed.landmarks)) {
+                            analysis.landmarks = parsed.landmarks.map((l) => ({
+                                name: l.name || 'Unknown',
+                                type: this.mapLandmarkType(l.type),
+                                confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+                                description: l.description || '',
+                                boundingBox: l.boundingBox,
+                            }));
+                        }
+                        if (Array.isArray(parsed.zones)) {
+                            analysis.zones = parsed.zones.map((z) => ({
+                                name: z.name || 'Unknown',
+                                type: this.mapZoneType(z.type),
+                                coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+                                description: z.description || '',
+                                boundingBox: z.boundingBox,
+                            }));
+                        }
+                        if (parsed.edges && typeof parsed.edges === 'object') {
+                            analysis.edges = {
+                                top: parsed.edges.top || '',
+                                left: parsed.edges.left || '',
+                                right: parsed.edges.right || '',
+                                bottom: parsed.edges.bottom || '',
+                            };
+                        }
+                        if (parsed.orientation) {
+                            analysis.orientation = this.mapOrientation(parsed.orientation);
+                        }
+                        analysis.isValid = true;
+                        this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+                        // Update the preferred format for future requests
+                        if (formatType !== this.llmProviderType) {
+                            this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+                            this.llmProviderType = formatType;
+                        }
+                        // Success - exit the retry loop
+                        return analysis;
                     }
-        … (2 lines of the old implementation, truncated in this diff view)
+                    catch (parseError) {
+                        this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+                        analysis.error = 'Failed to parse LLM response';
+                        return analysis;
                     }
-        analysis.isValid = true;
-        this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
                 }
-        … (3 lines of the old implementation, truncated in this diff view)
+            }
+            catch (e) {
+                lastError = e;
+                // Check if this is a vision/multimodal format error
+                if ((0, spatial_reasoning_1.isVisionFormatError)(e)) {
+                    this.console.warn(`[Discovery] ${formatType} format failed, trying fallback...`);
+                    continue; // Try next format
                 }
+                // Not a format error - don't retry
+                this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+                break;
             }
         }
-        … (3 lines of the old implementation, truncated in this diff view)
+        // All formats failed
+        if (lastError) {
+            const errorStr = String(lastError);
+            if ((0, spatial_reasoning_1.isVisionFormatError)(lastError)) {
+                analysis.error = 'Vision/image analysis failed with all formats. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured and the @scrypted/llm plugin supports vision.';
+            }
+            else {
+                analysis.error = `Analysis failed: ${errorStr}`;
+            }
         }
         // Cache the analysis
         this.sceneCache.set(cameraId, analysis);