@blueharford/scrypted-spatial-awareness 0.5.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +121 -65
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +21 -5
- package/src/core/topology-discovery.ts +114 -66
package/dist/plugin.zip
CHANGED
Binary file

package/out/main.nodejs.js
CHANGED
@@ -35095,6 +35095,7 @@ Object.defineProperty(exports, "__esModule", ({ value: true }));
 exports.SpatialReasoningEngine = void 0;
 exports.mediaObjectToBase64 = mediaObjectToBase64;
 exports.buildImageContent = buildImageContent;
+exports.isVisionNotSupportedError = isVisionNotSupportedError;
 const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
 const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
 const { systemManager, mediaManager } = sdk_1.default;
@@ -35127,10 +35128,12 @@ async function mediaObjectToBase64(mediaObject) {
 function buildImageContent(imageData, provider = 'unknown') {
     if (provider === 'openai') {
         // OpenAI format: uses data URL with image_url wrapper
+        // Include detail parameter for compatibility
         return {
             type: 'image_url',
             image_url: {
                 url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+                detail: 'auto',
             },
         };
     }
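For reference, the OpenAI-style content part assembled in this hunk has the shape below. This is a minimal standalone TypeScript sketch: the mediaType/base64 field names and the detail value come from the diff, while the ImageData and openAiImagePart names are illustrative.

// Sketch of the OpenAI-style image part built by buildImageContent.
// Field names follow the diff; type and function names are illustrative.
interface ImageData {
    mediaType: string; // e.g. 'image/jpeg'
    base64: string;    // base64 payload without the 'data:' prefix
}

function openAiImagePart(image: ImageData) {
    return {
        type: 'image_url' as const,
        image_url: {
            // OpenAI vision endpoints accept inline images as data URLs
            url: `data:${image.mediaType};base64,${image.base64}`,
            // 'auto' lets the API choose between low/high detail processing
            detail: 'auto' as const,
        },
    };
}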
@@ -35146,16 +35149,27 @@ function buildImageContent(imageData, provider = 'unknown') {
         };
     }
     else {
-        // Unknown provider: try
-        //
+        // Unknown provider: try Anthropic format first as it's more explicit
+        // Some plugins may translate this to OpenAI format internally
         return {
-            type: '
-
-
+            type: 'image',
+            source: {
+                type: 'base64',
+                media_type: imageData.mediaType,
+                data: imageData.base64,
             },
         };
     }
 }
+/** Check if an error indicates vision/multimodal content is not supported */
+function isVisionNotSupportedError(error) {
+    const errorStr = String(error);
+    return (errorStr.includes('content.str') ||
+        errorStr.includes('should be a valid string') ||
+        errorStr.includes('Invalid content type') ||
+        errorStr.includes('does not support vision') ||
+        errorStr.includes('image_url') && errorStr.includes('not supported'));
+}
 class SpatialReasoningEngine {
     config;
     console;
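The two content shapes buildImageContent can now emit differ in their discriminant field, which is what makes a misrouted format detectable at the provider. A sketch of both shapes, with illustrative type names and field names taken from the diff:

// Illustrative types for the two image-content shapes in this diff.
type OpenAIImagePart = {
    type: 'image_url';
    image_url: { url: string; detail?: 'auto' | 'low' | 'high' };
};

type AnthropicImagePart = {
    type: 'image';
    source: { type: 'base64'; media_type: string; data: string };
};

type ImagePart = OpenAIImagePart | AnthropicImagePart;

Note that isVisionNotSupportedError is a string-matching heuristic, not a typed error check: it stringifies the error and looks for substrings ('does not support vision', 'Invalid content type', and so on) that commonly appear when an endpoint rejects multimodal content, so it can misclassify unrelated errors that happen to contain those phrases.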
@@ -36186,72 +36200,114 @@ class TopologyDiscoveryEngine {
             analysis.error = 'Failed to capture camera snapshot';
             return analysis;
         }
-        ... (old lines 36189-36237: 49 removed lines not rendered in this diff view) ...
-        }
+        // Try with detected provider format first, then fallback to alternate format
+        const formatsToTry = [this.llmProviderType];
+        // Add fallback format
+        if (this.llmProviderType === 'openai') {
+            formatsToTry.push('anthropic');
+        }
+        else if (this.llmProviderType === 'anthropic') {
+            formatsToTry.push('openai');
+        }
+        else {
+            // Unknown - try both
+            formatsToTry.push('openai');
+        }
+        let lastError = null;
+        for (const formatType of formatsToTry) {
+            try {
+                this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+                // Build multimodal message with provider-specific image format
+                const result = await llm.getChatCompletion({
+                    messages: [
+                        {
+                            role: 'user',
+                            content: [
+                                { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+                                (0, spatial_reasoning_1.buildImageContent)(imageData, formatType),
+                            ],
+                        },
+                    ],
+                    max_tokens: 500,
+                    temperature: 0.3,
+                });
+                const content = result?.choices?.[0]?.message?.content;
+                if (content && typeof content === 'string') {
+                    try {
+                        // Extract JSON from response (handle markdown code blocks)
+                        let jsonStr = content.trim();
+                        if (jsonStr.startsWith('```')) {
+                            jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+                        }
+                        const parsed = JSON.parse(jsonStr);
+                        // Map parsed data to our types
+                        if (Array.isArray(parsed.landmarks)) {
+                            analysis.landmarks = parsed.landmarks.map((l) => ({
+                                name: l.name || 'Unknown',
+                                type: this.mapLandmarkType(l.type),
+                                confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+                                description: l.description || '',
+                                boundingBox: l.boundingBox,
+                            }));
+                        }
+                        if (Array.isArray(parsed.zones)) {
+                            analysis.zones = parsed.zones.map((z) => ({
+                                name: z.name || 'Unknown',
+                                type: this.mapZoneType(z.type),
+                                coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+                                description: z.description || '',
+                                boundingBox: z.boundingBox,
+                            }));
+                        }
+                        if (parsed.edges && typeof parsed.edges === 'object') {
+                            analysis.edges = {
+                                top: parsed.edges.top || '',
+                                left: parsed.edges.left || '',
+                                right: parsed.edges.right || '',
+                                bottom: parsed.edges.bottom || '',
+                            };
+                        }
+                        if (parsed.orientation) {
+                            analysis.orientation = this.mapOrientation(parsed.orientation);
+                        }
+                        analysis.isValid = true;
+                        this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+                        // Update the preferred format for future requests
+                        if (formatType !== this.llmProviderType) {
+                            this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+                            this.llmProviderType = formatType;
+                        }
+                        // Success - exit the retry loop
+                        return analysis;
                     }
-
-
+                    catch (parseError) {
+                        this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+                        analysis.error = 'Failed to parse LLM response';
+                        return analysis;
                     }
-                analysis.isValid = true;
-                this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
                 }
-
-
-
+            }
+            catch (e) {
+                lastError = e;
+                // Check if this is a vision/multimodal format error
+                if ((0, spatial_reasoning_1.isVisionNotSupportedError)(e)) {
+                    this.console.warn(`[Discovery] ${formatType} format not supported, trying fallback...`);
+                    continue; // Try next format
                 }
+                // Not a format error - don't retry
+                this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+                break;
            }
        }
-
-
-
+        // All formats failed
+        if (lastError) {
+            const errorStr = String(lastError);
+            if ((0, spatial_reasoning_1.isVisionNotSupportedError)(lastError)) {
+                analysis.error = 'Vision/image analysis not supported by configured LLM. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured.';
+            }
+            else {
+                analysis.error = `Analysis failed: ${errorStr}`;
+            }
+        }
         // Cache the analysis
         this.sceneCache.set(cameraId, analysis);
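The control flow added in this last hunk reduces to an ordered-fallback loop: attempt the detected provider's image format first, fall back to the alternate format only when the failure looks like a vision-format rejection, and stop retrying on any other error. A condensed TypeScript sketch, in which analyzeWithFallback, attempt, and isVisionError are illustrative stand-ins for the plugin's actual call sites:

// Condensed sketch of the format-fallback pattern from the diff.
// Only the loop structure mirrors the plugin; all names here are illustrative.
type ImageFormat = 'openai' | 'anthropic';

async function analyzeWithFallback(
    preferred: ImageFormat,
    attempt: (format: ImageFormat) => Promise<string>,
    isVisionError: (e: unknown) => boolean,
): Promise<string> {
    // Detected format first, alternate format second.
    const formats: ImageFormat[] =
        preferred === 'anthropic' ? ['anthropic', 'openai'] : ['openai', 'anthropic'];
    let lastError: unknown = null;
    for (const format of formats) {
        try {
            return await attempt(format); // first success wins
        }
        catch (e) {
            lastError = e;
            if (isVisionError(e)) {
                continue; // format rejected: try the alternate shape
            }
            break; // unrelated failure: do not retry with another format
        }
    }
    throw lastError ?? new Error('no image formats to attempt');
}

On success, the diff also persists the working format (this.llmProviderType = formatType) so later requests skip the shape that failed.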