@blueharford/scrypted-spatial-awareness 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +43 -19
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +39 -14
- package/src/core/topology-discovery.ts +16 -8
package/dist/plugin.zip
CHANGED
Binary file

package/out/main.nodejs.js
CHANGED
@@ -35094,29 +35094,46 @@ var __importStar = (this && this.__importStar) || (function () {
 Object.defineProperty(exports, "__esModule", ({ value: true }));
 exports.SpatialReasoningEngine = void 0;
 exports.mediaObjectToBase64 = mediaObjectToBase64;
+exports.buildImageContent = buildImageContent;
 const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
 const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
 const { systemManager, mediaManager } = sdk_1.default;
 /**
- * Convert a MediaObject to
+ * Convert a MediaObject to base64 image data for vision LLM consumption
  * @param mediaObject - MediaObject from camera.takePicture()
- * @returns
+ * @returns ImageData with raw base64 and media type, or null if conversion fails
  */
 async function mediaObjectToBase64(mediaObject) {
     try {
         // Convert MediaObject to Buffer using mediaManager
         const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, sdk_1.ScryptedMimeTypes.Image);
-        // Convert buffer to base64
+        // Convert buffer to base64 (raw, no data URL prefix)
        const base64 = buffer.toString('base64');
         // Determine MIME type - default to JPEG for camera images
-        const
-        return
+        const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+        return { base64, mediaType };
     }
     catch (e) {
         console.warn('Failed to convert MediaObject to base64:', e);
         return null;
     }
 }
+/**
+ * Build image content block for ChatCompletion API
+ * Compatible with both OpenAI and Anthropic formats via @scrypted/llm
+ */
+function buildImageContent(imageData) {
+    // Use Anthropic's native format which @scrypted/llm should translate
+    // This format is more explicit about the base64 data
+    return {
+        type: 'image',
+        source: {
+            type: 'base64',
+            media_type: imageData.mediaType,
+            data: imageData.base64,
+        },
+    };
+}
 class SpatialReasoningEngine {
     config;
     console;
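Reconstructed for readability, the two helpers added in this hunk look roughly like the following in TypeScript source form. This is a sketch based only on the compiled output above, not the exact contents of src/core/spatial-reasoning.ts: the ImageData type name is inferred from the new @returns JSDoc, and the truncated pre-0.5.2 lines are omitted.

import sdk, { MediaObject, ScryptedMimeTypes } from '@scrypted/sdk';

const { mediaManager } = sdk;

// Shape inferred from the "@returns ImageData with raw base64 and media type" JSDoc.
interface ImageData {
    base64: string;    // raw base64, no data URL prefix
    mediaType: string; // e.g. 'image/jpeg'
}

export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<ImageData | null> {
    try {
        // Convert MediaObject to Buffer using mediaManager.
        const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
        return {
            base64: buffer.toString('base64'),
            // Drop any MIME parameters (';charset=...'); default to JPEG for camera images.
            mediaType: mediaObject.mimeType?.split(';')[0] || 'image/jpeg',
        };
    }
    catch (e) {
        console.warn('Failed to convert MediaObject to base64:', e);
        return null;
    }
}

// Anthropic-native image content block; per the comment in the compiled
// output, @scrypted/llm is expected to translate this for other backends.
export function buildImageContent(imageData: ImageData) {
    return {
        type: 'image',
        source: {
            type: 'base64',
            media_type: imageData.mediaType,
            data: imageData.base64,
        },
    };
}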
@@ -35661,7 +35678,7 @@ class SpatialReasoningEngine {
             return null;
         try {
             // Convert image to base64 for vision LLM
-            const
+            const imageData = await mediaObjectToBase64(mediaObject);
             // Retrieve relevant context for RAG
             const relevantChunks = this.retrieveRelevantContext(fromCamera.deviceId, toCamera.deviceId);
             // Build RAG context
@@ -35670,11 +35687,11 @@ class SpatialReasoningEngine {
             const prompt = this.buildLlmPrompt(tracked, fromCamera, toCamera, transitTime, fromLandmarks, toLandmarks, ragContext);
             // Build message content - use multimodal format if we have an image
             let messageContent;
-            if (
-            // Vision-capable multimodal message format (
+            if (imageData) {
+                // Vision-capable multimodal message format (Anthropic native format)
                 messageContent = [
                     { type: 'text', text: prompt },
-
+                    buildImageContent(imageData),
                 ];
             }
             else {
@@ -35741,7 +35758,7 @@ Generate ONLY the description, nothing else:`;
             return null;
         try {
             // Convert image to base64 for vision LLM
-            const
+            const imageData = await mediaObjectToBase64(mediaObject);
             const prompt = `Analyze this security camera image. A ${objectClass} was detected.
 
 Looking at the surroundings and environment, identify any notable landmarks or features visible that could help describe this location. Consider:
@@ -35756,11 +35773,11 @@ If you can identify a clear landmark feature, respond with ONLY a JSON object:
 If no clear landmark is identifiable, respond with: {"name": null}`;
             // Build message content - use multimodal format if we have an image
             let messageContent;
-            if (
-            // Vision-capable multimodal message format (
+            if (imageData) {
+                // Vision-capable multimodal message format (Anthropic native format)
                 messageContent = [
                     { type: 'text', text: prompt },
-
+                    buildImageContent(imageData),
                 ];
             }
             else {
@@ -36075,7 +36092,7 @@ class TopologyDiscoveryEngine {
         }
         return null;
     }
-    /** Get camera snapshot as
+    /** Get camera snapshot as ImageData */
     async getCameraSnapshot(cameraId) {
         try {
             const camera = systemManager.getDeviceById(cameraId);
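Only the opening lines of getCameraSnapshot appear in this hunk, but the updated JSDoc says it now returns the same ImageData shape. A plausible reading of how it composes with mediaObjectToBase64, reusing the ImageData type and helper from the sketch above; the takePicture call comes from the JSDoc in spatial-reasoning.ts, and the body below is an assumption, not this diff's content:

import sdk, { Camera } from '@scrypted/sdk';

const { systemManager } = sdk;

// Sketch only: the diff shows just the first lines of this method.
async function getCameraSnapshot(cameraId: string): Promise<ImageData | null> {
    try {
        const camera = systemManager.getDeviceById<Camera>(cameraId);
        // Camera.takePicture() yields the MediaObject referenced by the
        // mediaObjectToBase64 JSDoc above.
        const mediaObject = await camera.takePicture();
        return mediaObjectToBase64(mediaObject);
    }
    catch (e) {
        console.warn('Failed to get camera snapshot:', e);
        return null;
    }
}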
@@ -36110,20 +36127,20 @@ class TopologyDiscoveryEngine {
             analysis.error = 'No LLM device available';
             return analysis;
         }
-        const
-        if (!
+        const imageData = await this.getCameraSnapshot(cameraId);
+        if (!imageData) {
             analysis.error = 'Failed to capture camera snapshot';
             return analysis;
         }
         try {
-            // Build multimodal message
+            // Build multimodal message with Anthropic-native format
            const result = await llm.getChatCompletion({
                 messages: [
                     {
                         role: 'user',
                         content: [
                             { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-
+                            (0, spatial_reasoning_1.buildImageContent)(imageData),
                         ],
                     },
                 ],
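Concretely, the "Anthropic-native format" these comments refer to is the content-block shape below. A sketch of the user message as assembled in this hunk, with the prompt text shortened and a placeholder base64 payload:

// Illustration only: prompt shortened, base64 truncated.
const userMessage = {
    role: 'user',
    content: [
        // SCENE_ANALYSIS_PROMPT in the plugin source.
        { type: 'text', text: 'Analyze this security camera scene...' },
        // Output of buildImageContent(imageData):
        {
            type: 'image',
            source: {
                type: 'base64',
                media_type: 'image/jpeg',
                data: '/9j/4AAQSkZJRg...',
            },
        },
    ],
};

OpenAI-style endpoints instead expect { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,...' } }, which is presumably the translation @scrypted/llm performs for non-Anthropic backends.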
@@ -36258,6 +36275,13 @@ class TopologyDiscoveryEngine {
         }
         this.status.camerasAnalyzed = analyses.length;
         this.console.log(`[Discovery] Analyzed ${analyses.length} cameras successfully`);
+        // Handle case where no cameras were successfully analyzed
+        if (analyses.length === 0) {
+            this.console.warn('[Discovery] No cameras were successfully analyzed');
+            this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
+            this.status.lastScanTime = Date.now();
+            return null;
+        }
         // Correlate if we have multiple cameras
         let correlation = null;
         if (analyses.length >= 2) {
@@ -36266,7 +36290,7 @@ class TopologyDiscoveryEngine {
                 this.generateSuggestionsFromCorrelation(correlation);
             }
         }
-        else {
+        else if (analyses.length === 1) {
             // Single camera - generate suggestions from its analysis
             this.generateSuggestionsFromAnalysis(analyses[0]);
         }