@blueharford/scrypted-spatial-awareness 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +101 -23
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +81 -18
- package/src/core/topology-discovery.ts +33 -8
package/out/plugin.zip
CHANGED
Binary file
package/package.json
CHANGED

package/src/core/spatial-reasoning.ts
CHANGED
@@ -69,29 +69,77 @@ interface ChatCompletionDevice extends ScryptedDevice {
   streamChatCompletion?(params: any): AsyncGenerator<any>;
 }
 
+/** Image data for LLM vision APIs */
+export interface ImageData {
+  /** Raw base64 encoded image data (no data URL prefix) */
+  base64: string;
+  /** MIME type (e.g., 'image/jpeg') */
+  mediaType: string;
+}
+
 /**
- * Convert a MediaObject to
+ * Convert a MediaObject to base64 image data for vision LLM consumption
  * @param mediaObject - MediaObject from camera.takePicture()
- * @returns
+ * @returns ImageData with raw base64 and media type, or null if conversion fails
  */
-export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<
+export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<ImageData | null> {
   try {
     // Convert MediaObject to Buffer using mediaManager
     const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
 
-    // Convert buffer to base64
+    // Convert buffer to base64 (raw, no data URL prefix)
     const base64 = buffer.toString('base64');
 
     // Determine MIME type - default to JPEG for camera images
-    const
+    const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
 
-    return
+    return { base64, mediaType };
   } catch (e) {
     console.warn('Failed to convert MediaObject to base64:', e);
     return null;
   }
 }
 
+/** LLM Provider type for image format selection */
+export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
+
+/**
+ * Build image content block for ChatCompletion API
+ * Supports both OpenAI and Anthropic formats
+ * @param imageData - Image data with base64 and media type
+ * @param provider - The LLM provider type (openai, anthropic, or unknown)
+ */
+export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
+  if (provider === 'openai') {
+    // OpenAI format: uses data URL with image_url wrapper
+    return {
+      type: 'image_url',
+      image_url: {
+        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      },
+    };
+  } else if (provider === 'anthropic') {
+    // Anthropic format: uses separate base64 data and media_type
+    return {
+      type: 'image',
+      source: {
+        type: 'base64',
+        media_type: imageData.mediaType,
+        data: imageData.base64,
+      },
+    };
+  } else {
+    // Unknown provider: try OpenAI format as it's more commonly supported
+    // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+    return {
+      type: 'image_url',
+      image_url: {
+        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      },
+    };
+  }
+}
+
 export class SpatialReasoningEngine {
   private config: SpatialReasoningConfig;
   private console: Console;
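For reference, a minimal sketch of the two content-block shapes the new buildImageContent helper produces; the function and types come from the hunk above, while the sample ImageData literal is hypothetical (in the plugin it would come from mediaObjectToBase64()):

import { buildImageContent, ImageData } from './spatial-reasoning';

// Hypothetical sample payload.
const imageData: ImageData = { base64: '/9j/4AAQSkZJRg', mediaType: 'image/jpeg' };

// OpenAI-style block: data URL wrapped in an image_url object.
// => { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,/9j/4AAQSkZJRg' } }
const openaiBlock = buildImageContent(imageData, 'openai');

// Anthropic-style block: raw base64 and media_type nested under source.
// => { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: '/9j/4AAQSkZJRg' } }
const anthropicBlock = buildImageContent(imageData, 'anthropic');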
@@ -336,6 +384,7 @@ export class SpatialReasoningEngine {
 
   private llmSearched: boolean = false;
   private llmProvider: string | null = null;
+  private llmProviderType: LlmProvider = 'unknown';
 
   /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
   private async findLlmDevice(): Promise<ChatCompletionDevice | null> {
@@ -356,27 +405,36 @@ export class SpatialReasoningEngine {
       const deviceName = device.name?.toLowerCase() || '';
       const pluginId = (device as any).pluginId?.toLowerCase() || '';
 
-      // Identify the provider type for logging
+      // Identify the provider type for logging and image format selection
       let providerType = 'Unknown';
-
-
-      }
+      let providerTypeEnum: LlmProvider = 'unknown';
+
       if (deviceName.includes('openai') || deviceName.includes('gpt')) {
         providerType = 'OpenAI';
+        providerTypeEnum = 'openai';
       } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
         providerType = 'Anthropic';
+        providerTypeEnum = 'anthropic';
       } else if (deviceName.includes('ollama')) {
         providerType = 'Ollama';
+        providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
       } else if (deviceName.includes('gemini') || deviceName.includes('google')) {
         providerType = 'Google';
+        providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
       } else if (deviceName.includes('llama')) {
         providerType = 'llama.cpp';
+        providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+      } else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+        providerType = 'Scrypted LLM';
+        providerTypeEnum = 'unknown';
       }
 
       this.llmDevice = device as unknown as ChatCompletionDevice;
       this.llmProvider = `${providerType} (${device.name})`;
+      this.llmProviderType = providerTypeEnum;
       this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
       this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+      this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
       this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
       return this.llmDevice;
     }
@@ -398,6 +456,11 @@ export class SpatialReasoningEngine {
     return this.llmProvider;
   }
 
+  /** Get the current LLM provider type for image format selection */
+  getLlmProviderType(): LlmProvider {
+    return this.llmProviderType;
+  }
+
   /** Check if LLM is available */
   isLlmAvailable(): boolean {
     return this.llmDevice !== null;
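Taken together, a hedged sketch of how these pieces compose into a vision request; takePicture, getChatCompletion, and both helpers all appear in this diff, but the surrounding wiring (camera, llm, engine) is assumed:

// Sketch only: assumes `camera` implements the Scrypted Camera interface,
// `llm` is the discovered ChatCompletionDevice, and `engine` is the SpatialReasoningEngine.
const mediaObject = await camera.takePicture();
const imageData = await mediaObjectToBase64(mediaObject);

const content: any[] = [{ type: 'text', text: 'Describe this scene.' }];
if (imageData) {
  // Select the image block shape for the detected provider.
  content.push(buildImageContent(imageData, engine.getLlmProviderType()));
}

const result = await llm.getChatCompletion({
  messages: [{ role: 'user', content }],
});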
@@ -751,7 +814,7 @@ export class SpatialReasoningEngine {
 
     try {
       // Convert image to base64 for vision LLM
-      const
+      const imageData = await mediaObjectToBase64(mediaObject);
 
       // Retrieve relevant context for RAG
       const relevantChunks = this.retrieveRelevantContext(
@@ -775,11 +838,11 @@ export class SpatialReasoningEngine {
 
       // Build message content - use multimodal format if we have an image
       let messageContent: any;
-      if (
-        // Vision-capable multimodal message format (
+      if (imageData) {
+        // Vision-capable multimodal message format (provider-specific)
         messageContent = [
           { type: 'text', text: prompt },
-
+          buildImageContent(imageData, this.llmProviderType),
         ];
       } else {
         // Fallback to text-only if image conversion failed
@@ -863,7 +926,7 @@ Generate ONLY the description, nothing else:`;
 
     try {
       // Convert image to base64 for vision LLM
-      const
+      const imageData = await mediaObjectToBase64(mediaObject);
 
       const prompt = `Analyze this security camera image. A ${objectClass} was detected.
 
@@ -880,11 +943,11 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
 
       // Build message content - use multimodal format if we have an image
       let messageContent: any;
-      if (
-        // Vision-capable multimodal message format (
+      if (imageData) {
+        // Vision-capable multimodal message format (provider-specific)
        messageContent = [
           { type: 'text', text: prompt },
-
+          buildImageContent(imageData, this.llmProviderType),
         ];
       } else {
         // Fallback to text-only if image conversion failed
package/src/core/topology-discovery.ts
CHANGED

@@ -30,7 +30,7 @@ import {
   Landmark,
   findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64 } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
 
 const { systemManager } = sdk;
 
@@ -100,6 +100,7 @@ export class TopologyDiscoveryEngine {
   private topology: CameraTopology | null = null;
   private llmDevice: ChatCompletionDevice | null = null;
   private llmSearched: boolean = false;
+  private llmProviderType: LlmProvider = 'unknown';
 
   // Scene analysis cache (camera ID -> analysis)
   private sceneCache: Map<string, SceneAnalysis> = new Map();
@@ -177,8 +178,24 @@ export class TopologyDiscoveryEngine {
       if (!device) continue;
 
       if (device.interfaces?.includes('ChatCompletion')) {
+        const deviceName = device.name?.toLowerCase() || '';
+
+        // Detect provider type for image format selection
+        if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+          this.llmProviderType = 'openai';
+        } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+          this.llmProviderType = 'anthropic';
+        } else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+            deviceName.includes('google') || deviceName.includes('llama')) {
+          // These providers use OpenAI-compatible format
+          this.llmProviderType = 'openai';
+        } else {
+          this.llmProviderType = 'unknown';
+        }
+
         this.llmDevice = device as unknown as ChatCompletionDevice;
         this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+        this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
         return this.llmDevice;
       }
     }
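Since detection here is a substring heuristic on the device name, its behavior can be illustrated standalone; detectProviderType is a hypothetical extraction of the branch logic above, not a function in the package:

// Hypothetical standalone form of the detection branches.
function detectProviderType(name: string): LlmProvider {
  const n = name.toLowerCase();
  if (n.includes('openai') || n.includes('gpt')) return 'openai';
  if (n.includes('anthropic') || n.includes('claude')) return 'anthropic';
  if (n.includes('ollama') || n.includes('gemini') || n.includes('google') || n.includes('llama'))
    return 'openai'; // OpenAI-compatible providers
  return 'unknown';
}

detectProviderType('Anthropic Claude'); // 'anthropic'
detectProviderType('Ollama (llava)');   // 'openai'
detectProviderType('Local Vision LLM'); // 'unknown'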
@@ -191,8 +208,8 @@ export class TopologyDiscoveryEngine {
     return null;
   }
 
-  /** Get camera snapshot as
-  private async getCameraSnapshot(cameraId: string): Promise<
+  /** Get camera snapshot as ImageData */
+  private async getCameraSnapshot(cameraId: string): Promise<ImageData | null> {
     try {
       const camera = systemManager.getDeviceById<Camera>(cameraId);
       if (!camera?.interfaces?.includes(ScryptedInterface.Camera)) {
@@ -230,21 +247,21 @@ export class TopologyDiscoveryEngine {
       return analysis;
     }
 
-    const
-    if (!
+    const imageData = await this.getCameraSnapshot(cameraId);
+    if (!imageData) {
       analysis.error = 'Failed to capture camera snapshot';
       return analysis;
     }
 
     try {
-      // Build multimodal message
+      // Build multimodal message with provider-specific image format
       const result = await llm.getChatCompletion({
         messages: [
           {
             role: 'user',
             content: [
               { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-
+              buildImageContent(imageData, this.llmProviderType),
             ],
           },
         ],
@@ -387,6 +404,14 @@ export class TopologyDiscoveryEngine {
     this.status.camerasAnalyzed = analyses.length;
     this.console.log(`[Discovery] Analyzed ${analyses.length} cameras successfully`);
 
+    // Handle case where no cameras were successfully analyzed
+    if (analyses.length === 0) {
+      this.console.warn('[Discovery] No cameras were successfully analyzed');
+      this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
+      this.status.lastScanTime = Date.now();
+      return null;
+    }
+
     // Correlate if we have multiple cameras
     let correlation: TopologyCorrelation | null = null;
     if (analyses.length >= 2) {
@@ -394,7 +419,7 @@ export class TopologyDiscoveryEngine {
       if (correlation) {
         this.generateSuggestionsFromCorrelation(correlation);
       }
-    } else {
+    } else if (analyses.length === 1) {
       // Single camera - generate suggestions from its analysis
       this.generateSuggestionsFromAnalysis(analyses[0]);
     }