@blueharford/scrypted-spatial-awareness 0.5.2 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +190 -80
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +74 -20
- package/src/core/topology-discovery.ts +131 -66
package/dist/plugin.zip
CHANGED
|
Binary file
|
package/out/main.nodejs.js
CHANGED
|
@@ -35095,6 +35095,7 @@ Object.defineProperty(exports, "__esModule", ({ value: true }));
|
|
|
35095
35095
|
exports.SpatialReasoningEngine = void 0;
|
|
35096
35096
|
exports.mediaObjectToBase64 = mediaObjectToBase64;
|
|
35097
35097
|
exports.buildImageContent = buildImageContent;
|
|
35098
|
+
exports.isVisionNotSupportedError = isVisionNotSupportedError;
|
|
35098
35099
|
const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
|
|
35099
35100
|
const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
|
|
35100
35101
|
const { systemManager, mediaManager } = sdk_1.default;
|
|
@@ -35120,19 +35121,54 @@ async function mediaObjectToBase64(mediaObject) {
|
|
|
35120
35121
|
}
|
|
35121
35122
|
/**
|
|
35122
35123
|
* Build image content block for ChatCompletion API
|
|
35123
|
-
*
|
|
35124
|
+
* Supports both OpenAI and Anthropic formats
|
|
35125
|
+
* @param imageData - Image data with base64 and media type
|
|
35126
|
+
* @param provider - The LLM provider type (openai, anthropic, or unknown)
|
|
35124
35127
|
*/
|
|
35125
|
-
function buildImageContent(imageData) {
|
|
35126
|
-
|
|
35127
|
-
|
|
35128
|
-
|
|
35129
|
-
|
|
35130
|
-
|
|
35131
|
-
|
|
35132
|
-
|
|
35133
|
-
|
|
35134
|
-
|
|
35135
|
-
|
|
35128
|
+
/**
 * Build the image content block for a ChatCompletion message.
 *
 * Two wire formats are produced:
 *  - 'openai': `{ type: 'image_url', image_url: { url: <data URL>, detail: 'auto' } }`
 *  - anything else ('anthropic', 'unknown', or an unrecognised value):
 *    `{ type: 'image', source: { type: 'base64', media_type, data } }`
 *
 * @param imageData - Image data with `base64` payload and `mediaType`
 * @param provider - The LLM provider type ('openai', 'anthropic', or 'unknown')
 * @returns The provider-specific image content block
 */
function buildImageContent(imageData, provider = 'unknown') {
    // Without a media type the result would contain the literal text
    // "undefined" (e.g. `data:undefined;base64,...`).
    // NOTE(review): falling back to JPEG assumes camera snapshots are JPEG
    // when the caller did not say otherwise - confirm against mediaObjectToBase64.
    const mediaType = imageData.mediaType || 'image/jpeg';
    if (provider === 'openai') {
        // OpenAI format: data URL inside an image_url wrapper.
        // The detail parameter is included for compatibility.
        return {
            type: 'image_url',
            image_url: {
                url: `data:${mediaType};base64,${imageData.base64}`,
                detail: 'auto',
            },
        };
    }
    // Anthropic format: separate base64 data and media_type. It is also used
    // for unknown providers because it is the more explicit of the two shapes.
    return {
        type: 'image',
        source: {
            type: 'base64',
            media_type: mediaType,
            data: imageData.base64,
        },
    };
}
|
|
35164
|
+
/**
 * Flatten an error value into one searchable string.
 *
 * Combines `String(error)`, `error.message` (when it is a string) and, for
 * objects, a JSON serialisation so that nested fields such as
 * `{ error: { message } }` are searchable too.
 */
function collectErrorText(error) {
    const parts = [String(error)];
    if (error !== null && typeof error === 'object') {
        if (typeof error.message === 'string') {
            parts.push(error.message);
        }
        try {
            const serialized = JSON.stringify(error);
            if (typeof serialized === 'string') {
                parts.push(serialized);
            }
        }
        catch {
            // Circular or otherwise unserialisable - the parts gathered so far are enough.
        }
    }
    return parts.join('\n');
}
/**
 * Check if an error indicates vision/multimodal content is not supported.
 *
 * Plain-object errors stringify to "[object Object]", so the check looks at
 * the message and serialised form as well as `String(error)`.
 *
 * @param error - Any thrown value (Error, string, plain object, null, ...)
 * @returns true when the error text contains one of the known
 *          "multimodal content rejected" markers
 */
function isVisionNotSupportedError(error) {
    const errorStr = collectErrorText(error);
    const markers = [
        'content.str',
        'should be a valid string',
        'Invalid content type',
        'does not support vision',
    ];
    if (markers.some((marker) => errorStr.includes(marker))) {
        return true;
    }
    // 'image_url' alone is not conclusive; it must come with 'not supported'.
    return errorStr.includes('image_url') && errorStr.includes('not supported');
}
|
|
35137
35173
|
class SpatialReasoningEngine {
|
|
35138
35174
|
config;
|
|
@@ -35353,6 +35389,7 @@ class SpatialReasoningEngine {
|
|
|
35353
35389
|
}
|
|
35354
35390
|
llmSearched = false;
|
|
35355
35391
|
llmProvider = null;
|
|
35392
|
+
llmProviderType = 'unknown';
|
|
35356
35393
|
/** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
|
|
35357
35394
|
async findLlmDevice() {
|
|
35358
35395
|
if (this.llmDevice)
|
|
@@ -35371,30 +35408,39 @@ class SpatialReasoningEngine {
|
|
|
35371
35408
|
if (device.interfaces?.includes('ChatCompletion')) {
|
|
35372
35409
|
const deviceName = device.name?.toLowerCase() || '';
|
|
35373
35410
|
const pluginId = device.pluginId?.toLowerCase() || '';
|
|
35374
|
-
// Identify the provider type for logging
|
|
35411
|
+
// Identify the provider type for logging and image format selection
|
|
35375
35412
|
let providerType = 'Unknown';
|
|
35376
|
-
|
|
35377
|
-
providerType = 'Scrypted LLM';
|
|
35378
|
-
}
|
|
35413
|
+
let providerTypeEnum = 'unknown';
|
|
35379
35414
|
if (deviceName.includes('openai') || deviceName.includes('gpt')) {
|
|
35380
35415
|
providerType = 'OpenAI';
|
|
35416
|
+
providerTypeEnum = 'openai';
|
|
35381
35417
|
}
|
|
35382
35418
|
else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
|
|
35383
35419
|
providerType = 'Anthropic';
|
|
35420
|
+
providerTypeEnum = 'anthropic';
|
|
35384
35421
|
}
|
|
35385
35422
|
else if (deviceName.includes('ollama')) {
|
|
35386
35423
|
providerType = 'Ollama';
|
|
35424
|
+
providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
|
|
35387
35425
|
}
|
|
35388
35426
|
else if (deviceName.includes('gemini') || deviceName.includes('google')) {
|
|
35389
35427
|
providerType = 'Google';
|
|
35428
|
+
providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
|
|
35390
35429
|
}
|
|
35391
35430
|
else if (deviceName.includes('llama')) {
|
|
35392
35431
|
providerType = 'llama.cpp';
|
|
35432
|
+
providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
|
|
35433
|
+
}
|
|
35434
|
+
else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
|
|
35435
|
+
providerType = 'Scrypted LLM';
|
|
35436
|
+
providerTypeEnum = 'unknown';
|
|
35393
35437
|
}
|
|
35394
35438
|
this.llmDevice = device;
|
|
35395
35439
|
this.llmProvider = `${providerType} (${device.name})`;
|
|
35440
|
+
this.llmProviderType = providerTypeEnum;
|
|
35396
35441
|
this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
|
|
35397
35442
|
this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
|
|
35443
|
+
this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
|
|
35398
35444
|
this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
|
|
35399
35445
|
return this.llmDevice;
|
|
35400
35446
|
}
|
|
@@ -35412,6 +35458,10 @@ class SpatialReasoningEngine {
|
|
|
35412
35458
|
getLlmProvider() {
|
|
35413
35459
|
return this.llmProvider;
|
|
35414
35460
|
}
|
|
35461
|
+
/** Get the current LLM provider type for image format selection */
|
|
35462
|
+
getLlmProviderType() {
|
|
35463
|
+
return this.llmProviderType;
|
|
35464
|
+
}
|
|
35415
35465
|
/** Check if LLM is available */
|
|
35416
35466
|
isLlmAvailable() {
|
|
35417
35467
|
return this.llmDevice !== null;
|
|
@@ -35688,10 +35738,10 @@ class SpatialReasoningEngine {
|
|
|
35688
35738
|
// Build message content - use multimodal format if we have an image
|
|
35689
35739
|
let messageContent;
|
|
35690
35740
|
if (imageData) {
|
|
35691
|
-
// Vision-capable multimodal message format (
|
|
35741
|
+
// Vision-capable multimodal message format (provider-specific)
|
|
35692
35742
|
messageContent = [
|
|
35693
35743
|
{ type: 'text', text: prompt },
|
|
35694
|
-
buildImageContent(imageData),
|
|
35744
|
+
buildImageContent(imageData, this.llmProviderType),
|
|
35695
35745
|
];
|
|
35696
35746
|
}
|
|
35697
35747
|
else {
|
|
@@ -35774,10 +35824,10 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
|
|
|
35774
35824
|
// Build message content - use multimodal format if we have an image
|
|
35775
35825
|
let messageContent;
|
|
35776
35826
|
if (imageData) {
|
|
35777
|
-
// Vision-capable multimodal message format (
|
|
35827
|
+
// Vision-capable multimodal message format (provider-specific)
|
|
35778
35828
|
messageContent = [
|
|
35779
35829
|
{ type: 'text', text: prompt },
|
|
35780
|
-
buildImageContent(imageData),
|
|
35830
|
+
buildImageContent(imageData, this.llmProviderType),
|
|
35781
35831
|
];
|
|
35782
35832
|
}
|
|
35783
35833
|
else {
|
|
@@ -36017,6 +36067,7 @@ class TopologyDiscoveryEngine {
|
|
|
36017
36067
|
topology = null;
|
|
36018
36068
|
llmDevice = null;
|
|
36019
36069
|
llmSearched = false;
|
|
36070
|
+
llmProviderType = 'unknown';
|
|
36020
36071
|
// Scene analysis cache (camera ID -> analysis)
|
|
36021
36072
|
sceneCache = new Map();
|
|
36022
36073
|
// Pending suggestions for user review
|
|
@@ -36080,8 +36131,25 @@ class TopologyDiscoveryEngine {
|
|
|
36080
36131
|
if (!device)
|
|
36081
36132
|
continue;
|
|
36082
36133
|
if (device.interfaces?.includes('ChatCompletion')) {
|
|
36134
|
+
const deviceName = device.name?.toLowerCase() || '';
|
|
36135
|
+
// Detect provider type for image format selection
|
|
36136
|
+
if (deviceName.includes('openai') || deviceName.includes('gpt')) {
|
|
36137
|
+
this.llmProviderType = 'openai';
|
|
36138
|
+
}
|
|
36139
|
+
else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
|
|
36140
|
+
this.llmProviderType = 'anthropic';
|
|
36141
|
+
}
|
|
36142
|
+
else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
|
|
36143
|
+
deviceName.includes('google') || deviceName.includes('llama')) {
|
|
36144
|
+
// These providers use OpenAI-compatible format
|
|
36145
|
+
this.llmProviderType = 'openai';
|
|
36146
|
+
}
|
|
36147
|
+
else {
|
|
36148
|
+
this.llmProviderType = 'unknown';
|
|
36149
|
+
}
|
|
36083
36150
|
this.llmDevice = device;
|
|
36084
36151
|
this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
|
|
36152
|
+
this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
|
|
36085
36153
|
return this.llmDevice;
|
|
36086
36154
|
}
|
|
36087
36155
|
}
|
|
@@ -36132,72 +36200,114 @@ class TopologyDiscoveryEngine {
|
|
|
36132
36200
|
analysis.error = 'Failed to capture camera snapshot';
|
|
36133
36201
|
return analysis;
|
|
36134
36202
|
}
|
|
36135
|
-
|
|
36136
|
-
|
|
36137
|
-
|
|
36138
|
-
|
|
36139
|
-
|
|
36140
|
-
|
|
36141
|
-
|
|
36142
|
-
|
|
36143
|
-
|
|
36144
|
-
|
|
36145
|
-
|
|
36146
|
-
|
|
36147
|
-
|
|
36148
|
-
|
|
36149
|
-
|
|
36150
|
-
|
|
36151
|
-
|
|
36152
|
-
|
|
36153
|
-
|
|
36154
|
-
|
|
36155
|
-
|
|
36156
|
-
|
|
36157
|
-
|
|
36158
|
-
|
|
36159
|
-
|
|
36160
|
-
|
|
36161
|
-
|
|
36162
|
-
|
|
36163
|
-
|
|
36164
|
-
|
|
36165
|
-
|
|
36166
|
-
|
|
36167
|
-
|
|
36168
|
-
|
|
36169
|
-
|
|
36170
|
-
|
|
36171
|
-
|
|
36172
|
-
|
|
36173
|
-
|
|
36174
|
-
|
|
36175
|
-
|
|
36176
|
-
|
|
36177
|
-
|
|
36178
|
-
|
|
36179
|
-
|
|
36180
|
-
|
|
36181
|
-
|
|
36182
|
-
|
|
36183
|
-
|
|
36184
|
-
}
|
|
36203
|
+
// Try with detected provider format first, then fallback to alternate format
|
|
36204
|
+
const formatsToTry = [this.llmProviderType];
|
|
36205
|
+
// Add fallback format
|
|
36206
|
+
if (this.llmProviderType === 'openai') {
|
|
36207
|
+
formatsToTry.push('anthropic');
|
|
36208
|
+
}
|
|
36209
|
+
else if (this.llmProviderType === 'anthropic') {
|
|
36210
|
+
formatsToTry.push('openai');
|
|
36211
|
+
}
|
|
36212
|
+
else {
|
|
36213
|
+
// Unknown - try both
|
|
36214
|
+
formatsToTry.push('openai');
|
|
36215
|
+
}
|
|
36216
|
+
let lastError = null;
|
|
36217
|
+
for (const formatType of formatsToTry) {
|
|
36218
|
+
try {
|
|
36219
|
+
this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
|
|
36220
|
+
// Build multimodal message with provider-specific image format
|
|
36221
|
+
const result = await llm.getChatCompletion({
|
|
36222
|
+
messages: [
|
|
36223
|
+
{
|
|
36224
|
+
role: 'user',
|
|
36225
|
+
content: [
|
|
36226
|
+
{ type: 'text', text: SCENE_ANALYSIS_PROMPT },
|
|
36227
|
+
(0, spatial_reasoning_1.buildImageContent)(imageData, formatType),
|
|
36228
|
+
],
|
|
36229
|
+
},
|
|
36230
|
+
],
|
|
36231
|
+
max_tokens: 500,
|
|
36232
|
+
temperature: 0.3,
|
|
36233
|
+
});
|
|
36234
|
+
const content = result?.choices?.[0]?.message?.content;
|
|
36235
|
+
if (content && typeof content === 'string') {
|
|
36236
|
+
try {
|
|
36237
|
+
// Extract JSON from response (handle markdown code blocks)
|
|
36238
|
+
let jsonStr = content.trim();
|
|
36239
|
+
if (jsonStr.startsWith('```')) {
|
|
36240
|
+
jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
|
|
36241
|
+
}
|
|
36242
|
+
const parsed = JSON.parse(jsonStr);
|
|
36243
|
+
// Map parsed data to our types
|
|
36244
|
+
if (Array.isArray(parsed.landmarks)) {
|
|
36245
|
+
analysis.landmarks = parsed.landmarks.map((l) => ({
|
|
36246
|
+
name: l.name || 'Unknown',
|
|
36247
|
+
type: this.mapLandmarkType(l.type),
|
|
36248
|
+
confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
|
|
36249
|
+
description: l.description || '',
|
|
36250
|
+
boundingBox: l.boundingBox,
|
|
36251
|
+
}));
|
|
36252
|
+
}
|
|
36253
|
+
if (Array.isArray(parsed.zones)) {
|
|
36254
|
+
analysis.zones = parsed.zones.map((z) => ({
|
|
36255
|
+
name: z.name || 'Unknown',
|
|
36256
|
+
type: this.mapZoneType(z.type),
|
|
36257
|
+
coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
|
|
36258
|
+
description: z.description || '',
|
|
36259
|
+
boundingBox: z.boundingBox,
|
|
36260
|
+
}));
|
|
36261
|
+
}
|
|
36262
|
+
if (parsed.edges && typeof parsed.edges === 'object') {
|
|
36263
|
+
analysis.edges = {
|
|
36264
|
+
top: parsed.edges.top || '',
|
|
36265
|
+
left: parsed.edges.left || '',
|
|
36266
|
+
right: parsed.edges.right || '',
|
|
36267
|
+
bottom: parsed.edges.bottom || '',
|
|
36268
|
+
};
|
|
36269
|
+
}
|
|
36270
|
+
if (parsed.orientation) {
|
|
36271
|
+
analysis.orientation = this.mapOrientation(parsed.orientation);
|
|
36272
|
+
}
|
|
36273
|
+
analysis.isValid = true;
|
|
36274
|
+
this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
|
|
36275
|
+
// Update the preferred format for future requests
|
|
36276
|
+
if (formatType !== this.llmProviderType) {
|
|
36277
|
+
this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
|
|
36278
|
+
this.llmProviderType = formatType;
|
|
36279
|
+
}
|
|
36280
|
+
// Success - exit the retry loop
|
|
36281
|
+
return analysis;
|
|
36185
36282
|
}
|
|
36186
|
-
|
|
36187
|
-
|
|
36283
|
+
catch (parseError) {
|
|
36284
|
+
this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
|
|
36285
|
+
analysis.error = 'Failed to parse LLM response';
|
|
36286
|
+
return analysis;
|
|
36188
36287
|
}
|
|
36189
|
-
analysis.isValid = true;
|
|
36190
|
-
this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
|
|
36191
36288
|
}
|
|
36192
|
-
|
|
36193
|
-
|
|
36194
|
-
|
|
36289
|
+
}
|
|
36290
|
+
catch (e) {
|
|
36291
|
+
lastError = e;
|
|
36292
|
+
// Check if this is a vision/multimodal format error
|
|
36293
|
+
if ((0, spatial_reasoning_1.isVisionNotSupportedError)(e)) {
|
|
36294
|
+
this.console.warn(`[Discovery] ${formatType} format not supported, trying fallback...`);
|
|
36295
|
+
continue; // Try next format
|
|
36195
36296
|
}
|
|
36297
|
+
// Not a format error - don't retry
|
|
36298
|
+
this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
|
|
36299
|
+
break;
|
|
36196
36300
|
}
|
|
36197
36301
|
}
|
|
36198
|
-
|
|
36199
|
-
|
|
36200
|
-
|
|
36302
|
+
// All formats failed
|
|
36303
|
+
if (lastError) {
|
|
36304
|
+
const errorStr = String(lastError);
|
|
36305
|
+
if ((0, spatial_reasoning_1.isVisionNotSupportedError)(lastError)) {
|
|
36306
|
+
analysis.error = 'Vision/image analysis not supported by configured LLM. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured.';
|
|
36307
|
+
}
|
|
36308
|
+
else {
|
|
36309
|
+
analysis.error = `Analysis failed: ${errorStr}`;
|
|
36310
|
+
}
|
|
36201
36311
|
}
|
|
36202
36312
|
// Cache the analysis
|
|
36203
36313
|
this.sceneCache.set(cameraId, analysis);
|