@blueharford/scrypted-spatial-awareness 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +3 -0
- package/dist/main.nodejs.js.LICENSE.txt +1 -0
- package/dist/main.nodejs.js.map +1 -0
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +44263 -0
- package/out/main.nodejs.js.map +1 -0
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +39 -14
- package/src/core/topology-discovery.ts +16 -8
- package/.vscode/settings.json +0 -3
package/out/plugin.zip
ADDED
|
Binary file
|
package/package.json
CHANGED
|
package/src/core/spatial-reasoning.ts
CHANGED
|
@@ -69,29 +69,54 @@ interface ChatCompletionDevice extends ScryptedDevice {
|
|
|
69
69
|
streamChatCompletion?(params: any): AsyncGenerator<any>;
|
|
70
70
|
}
|
|
71
71
|
|
|
72
|
+
/** Image data for LLM vision APIs */
|
|
73
|
+
export interface ImageData {
|
|
74
|
+
/** Raw base64 encoded image data (no data URL prefix) */
|
|
75
|
+
base64: string;
|
|
76
|
+
/** MIME type (e.g., 'image/jpeg') */
|
|
77
|
+
mediaType: string;
|
|
78
|
+
}
|
|
79
|
+
|
|
72
80
|
/**
|
|
73
|
-
* Convert a MediaObject to
|
|
81
|
+
* Convert a MediaObject to base64 image data for vision LLM consumption
|
|
74
82
|
* @param mediaObject - MediaObject from camera.takePicture()
|
|
75
|
-
* @returns
|
|
83
|
+
* @returns ImageData with raw base64 and media type, or null if conversion fails
|
|
76
84
|
*/
|
|
77
|
-
export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<
|
|
85
|
+
export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<ImageData | null> {
|
|
78
86
|
try {
|
|
79
87
|
// Convert MediaObject to Buffer using mediaManager
|
|
80
88
|
const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
|
|
81
89
|
|
|
82
|
-
// Convert buffer to base64
|
|
90
|
+
// Convert buffer to base64 (raw, no data URL prefix)
|
|
83
91
|
const base64 = buffer.toString('base64');
|
|
84
92
|
|
|
85
93
|
// Determine MIME type - default to JPEG for camera images
|
|
86
|
-
const
|
|
94
|
+
const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
|
|
87
95
|
|
|
88
|
-
return
|
|
96
|
+
return { base64, mediaType };
|
|
89
97
|
} catch (e) {
|
|
90
98
|
console.warn('Failed to convert MediaObject to base64:', e);
|
|
91
99
|
return null;
|
|
92
100
|
}
|
|
93
101
|
}
|
|
94
102
|
|
|
103
|
+
/**
|
|
104
|
+
* Build image content block for ChatCompletion API
|
|
105
|
+
* Compatible with both OpenAI and Anthropic formats via @scrypted/llm
|
|
106
|
+
*/
|
|
107
|
+
export function buildImageContent(imageData: ImageData): any {
|
|
108
|
+
// Use Anthropic's native format which @scrypted/llm should translate
|
|
109
|
+
// This format is more explicit about the base64 data
|
|
110
|
+
return {
|
|
111
|
+
type: 'image',
|
|
112
|
+
source: {
|
|
113
|
+
type: 'base64',
|
|
114
|
+
media_type: imageData.mediaType,
|
|
115
|
+
data: imageData.base64,
|
|
116
|
+
},
|
|
117
|
+
};
|
|
118
|
+
}
|
|
119
|
+
|
|
95
120
|
export class SpatialReasoningEngine {
|
|
96
121
|
private config: SpatialReasoningConfig;
|
|
97
122
|
private console: Console;
|
|
@@ -751,7 +776,7 @@ export class SpatialReasoningEngine {
|
|
|
751
776
|
|
|
752
777
|
try {
|
|
753
778
|
// Convert image to base64 for vision LLM
|
|
754
|
-
const
|
|
779
|
+
const imageData = await mediaObjectToBase64(mediaObject);
|
|
755
780
|
|
|
756
781
|
// Retrieve relevant context for RAG
|
|
757
782
|
const relevantChunks = this.retrieveRelevantContext(
|
|
@@ -775,11 +800,11 @@ export class SpatialReasoningEngine {
|
|
|
775
800
|
|
|
776
801
|
// Build message content - use multimodal format if we have an image
|
|
777
802
|
let messageContent: any;
|
|
778
|
-
if (
|
|
779
|
-
// Vision-capable multimodal message format (
|
|
803
|
+
if (imageData) {
|
|
804
|
+
// Vision-capable multimodal message format (Anthropic native format)
|
|
780
805
|
messageContent = [
|
|
781
806
|
{ type: 'text', text: prompt },
|
|
782
|
-
|
|
807
|
+
buildImageContent(imageData),
|
|
783
808
|
];
|
|
784
809
|
} else {
|
|
785
810
|
// Fallback to text-only if image conversion failed
|
|
@@ -863,7 +888,7 @@ Generate ONLY the description, nothing else:`;
|
|
|
863
888
|
|
|
864
889
|
try {
|
|
865
890
|
// Convert image to base64 for vision LLM
|
|
866
|
-
const
|
|
891
|
+
const imageData = await mediaObjectToBase64(mediaObject);
|
|
867
892
|
|
|
868
893
|
const prompt = `Analyze this security camera image. A ${objectClass} was detected.
|
|
869
894
|
|
|
@@ -880,11 +905,11 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
|
|
|
880
905
|
|
|
881
906
|
// Build message content - use multimodal format if we have an image
|
|
882
907
|
let messageContent: any;
|
|
883
|
-
if (
|
|
884
|
-
// Vision-capable multimodal message format (
|
|
908
|
+
if (imageData) {
|
|
909
|
+
// Vision-capable multimodal message format (Anthropic native format)
|
|
885
910
|
messageContent = [
|
|
886
911
|
{ type: 'text', text: prompt },
|
|
887
|
-
|
|
912
|
+
buildImageContent(imageData),
|
|
888
913
|
];
|
|
889
914
|
} else {
|
|
890
915
|
// Fallback to text-only if image conversion failed
|
|
package/src/core/topology-discovery.ts
CHANGED
|
@@ -30,7 +30,7 @@ import {
|
|
|
30
30
|
Landmark,
|
|
31
31
|
findCamera,
|
|
32
32
|
} from '../models/topology';
|
|
33
|
-
import { mediaObjectToBase64 } from './spatial-reasoning';
|
|
33
|
+
import { mediaObjectToBase64, buildImageContent, ImageData } from './spatial-reasoning';
|
|
34
34
|
|
|
35
35
|
const { systemManager } = sdk;
|
|
36
36
|
|
|
@@ -191,8 +191,8 @@ export class TopologyDiscoveryEngine {
|
|
|
191
191
|
return null;
|
|
192
192
|
}
|
|
193
193
|
|
|
194
|
-
/** Get camera snapshot as
|
|
195
|
-
private async getCameraSnapshot(cameraId: string): Promise<
|
|
194
|
+
/** Get camera snapshot as ImageData */
|
|
195
|
+
private async getCameraSnapshot(cameraId: string): Promise<ImageData | null> {
|
|
196
196
|
try {
|
|
197
197
|
const camera = systemManager.getDeviceById<Camera>(cameraId);
|
|
198
198
|
if (!camera?.interfaces?.includes(ScryptedInterface.Camera)) {
|
|
@@ -230,21 +230,21 @@ export class TopologyDiscoveryEngine {
|
|
|
230
230
|
return analysis;
|
|
231
231
|
}
|
|
232
232
|
|
|
233
|
-
const
|
|
234
|
-
if (!
|
|
233
|
+
const imageData = await this.getCameraSnapshot(cameraId);
|
|
234
|
+
if (!imageData) {
|
|
235
235
|
analysis.error = 'Failed to capture camera snapshot';
|
|
236
236
|
return analysis;
|
|
237
237
|
}
|
|
238
238
|
|
|
239
239
|
try {
|
|
240
|
-
// Build multimodal message
|
|
240
|
+
// Build multimodal message with Anthropic-native format
|
|
241
241
|
const result = await llm.getChatCompletion({
|
|
242
242
|
messages: [
|
|
243
243
|
{
|
|
244
244
|
role: 'user',
|
|
245
245
|
content: [
|
|
246
246
|
{ type: 'text', text: SCENE_ANALYSIS_PROMPT },
|
|
247
|
-
|
|
247
|
+
buildImageContent(imageData),
|
|
248
248
|
],
|
|
249
249
|
},
|
|
250
250
|
],
|
|
@@ -387,6 +387,14 @@ export class TopologyDiscoveryEngine {
|
|
|
387
387
|
this.status.camerasAnalyzed = analyses.length;
|
|
388
388
|
this.console.log(`[Discovery] Analyzed ${analyses.length} cameras successfully`);
|
|
389
389
|
|
|
390
|
+
// Handle case where no cameras were successfully analyzed
|
|
391
|
+
if (analyses.length === 0) {
|
|
392
|
+
this.console.warn('[Discovery] No cameras were successfully analyzed');
|
|
393
|
+
this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
|
|
394
|
+
this.status.lastScanTime = Date.now();
|
|
395
|
+
return null;
|
|
396
|
+
}
|
|
397
|
+
|
|
390
398
|
// Correlate if we have multiple cameras
|
|
391
399
|
let correlation: TopologyCorrelation | null = null;
|
|
392
400
|
if (analyses.length >= 2) {
|
|
@@ -394,7 +402,7 @@ export class TopologyDiscoveryEngine {
|
|
|
394
402
|
if (correlation) {
|
|
395
403
|
this.generateSuggestionsFromCorrelation(correlation);
|
|
396
404
|
}
|
|
397
|
-
} else {
|
|
405
|
+
} else if (analyses.length === 1) {
|
|
398
406
|
// Single camera - generate suggestions from its analysis
|
|
399
407
|
this.generateSuggestionsFromAnalysis(analyses[0]);
|
|
400
408
|
}
|
package/.vscode/settings.json
DELETED