@blueharford/scrypted-spatial-awareness 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/plugin.zip CHANGED
Binary file
@@ -35094,29 +35094,46 @@ var __importStar = (this && this.__importStar) || (function () {
  Object.defineProperty(exports, "__esModule", ({ value: true }));
  exports.SpatialReasoningEngine = void 0;
  exports.mediaObjectToBase64 = mediaObjectToBase64;
+ exports.buildImageContent = buildImageContent;
  const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
  const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
  const { systemManager, mediaManager } = sdk_1.default;
  /**
- * Convert a MediaObject to a base64 data URL for vision LLM consumption
+ * Convert a MediaObject to base64 image data for vision LLM consumption
  * @param mediaObject - MediaObject from camera.takePicture()
- * @returns Base64 data URL (data:image/jpeg;base64,...) or null if conversion fails
+ * @returns ImageData with raw base64 and media type, or null if conversion fails
  */
  async function mediaObjectToBase64(mediaObject) {
  try {
  // Convert MediaObject to Buffer using mediaManager
  const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, sdk_1.ScryptedMimeTypes.Image);
- // Convert buffer to base64
+ // Convert buffer to base64 (raw, no data URL prefix)
  const base64 = buffer.toString('base64');
  // Determine MIME type - default to JPEG for camera images
- const mimeType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
- return `data:${mimeType};base64,${base64}`;
+ const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+ return { base64, mediaType };
  }
  catch (e) {
  console.warn('Failed to convert MediaObject to base64:', e);
  return null;
  }
  }
+ /**
+ * Build image content block for ChatCompletion API
+ * Compatible with both OpenAI and Anthropic formats via @scrypted/llm
+ */
+ function buildImageContent(imageData) {
+ // Use Anthropic's native format which @scrypted/llm should translate
+ // This format is more explicit about the base64 data
+ return {
+ type: 'image',
+ source: {
+ type: 'base64',
+ media_type: imageData.mediaType,
+ data: imageData.base64,
+ },
+ };
+ }
  class SpatialReasoningEngine {
  config;
  console;
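
Net effect of the hunk above: mediaObjectToBase64 now returns a raw base64 payload plus its media type instead of a prefixed data URL, and the new buildImageContent helper wraps that pair in an Anthropic Messages-style image block. A minimal TypeScript sketch of the shapes involved (the ImageData name comes from the JSDoc; the plugin's actual type declarations are not part of this diff):

// Return shape of mediaObjectToBase64 after this change.
interface ImageData {
    base64: string;    // raw payload, no "data:...;base64," prefix
    mediaType: string; // e.g. "image/jpeg"
}

// Before: OpenAI-style block, built inline from a data URL.
type OpenAIImageBlock = { type: 'image_url'; image_url: { url: string } };

// After: Anthropic-style block, as produced by buildImageContent.
type AnthropicImageBlock = {
    type: 'image';
    source: { type: 'base64'; media_type: string; data: string };
};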
@@ -35661,7 +35678,7 @@ class SpatialReasoningEngine {
  return null;
  try {
  // Convert image to base64 for vision LLM
- const imageBase64 = await mediaObjectToBase64(mediaObject);
+ const imageData = await mediaObjectToBase64(mediaObject);
  // Retrieve relevant context for RAG
  const relevantChunks = this.retrieveRelevantContext(fromCamera.deviceId, toCamera.deviceId);
  // Build RAG context
@@ -35670,11 +35687,11 @@ class SpatialReasoningEngine {
  const prompt = this.buildLlmPrompt(tracked, fromCamera, toCamera, transitTime, fromLandmarks, toLandmarks, ragContext);
  // Build message content - use multimodal format if we have an image
  let messageContent;
- if (imageBase64) {
- // Vision-capable multimodal message format (OpenAI compatible)
+ if (imageData) {
+ // Vision-capable multimodal message format (Anthropic native format)
  messageContent = [
  { type: 'text', text: prompt },
- { type: 'image_url', image_url: { url: imageBase64 } },
+ buildImageContent(imageData),
  ];
  }
  else {
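
The same swap at this call site, shown as the content array before and after (base64 value illustrative):

// Before: data URL embedded in an OpenAI-style image_url block.
const before = [
    { type: 'text', text: prompt },
    { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,/9j/4AAQ...' } },
];

// After: the shared helper emits the Anthropic-style block.
const after = [
    { type: 'text', text: prompt },
    buildImageContent(imageData), // { type: 'image', source: { type: 'base64', ... } }
];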
@@ -35741,7 +35758,7 @@ Generate ONLY the description, nothing else:`;
  return null;
  try {
  // Convert image to base64 for vision LLM
- const imageBase64 = await mediaObjectToBase64(mediaObject);
+ const imageData = await mediaObjectToBase64(mediaObject);
  const prompt = `Analyze this security camera image. A ${objectClass} was detected.

  Looking at the surroundings and environment, identify any notable landmarks or features visible that could help describe this location. Consider:
@@ -35756,11 +35773,11 @@ If you can identify a clear landmark feature, respond with ONLY a JSON object:
  If no clear landmark is identifiable, respond with: {"name": null}`;
  // Build message content - use multimodal format if we have an image
  let messageContent;
- if (imageBase64) {
- // Vision-capable multimodal message format (OpenAI compatible)
+ if (imageData) {
+ // Vision-capable multimodal message format (Anthropic native format)
  messageContent = [
  { type: 'text', text: prompt },
- { type: 'image_url', image_url: { url: imageBase64 } },
+ buildImageContent(imageData),
  ];
  }
  else {
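
The landmark prompt above pins the model to a JSON-only reply: either {"name": "..."} or {"name": null}. The code that consumes the completion is outside this diff; a hypothetical parser honoring that contract might look like:

// Hypothetical consumer of the JSON-only landmark contract; the plugin's
// real response handling is not shown in this diff.
function parseLandmarkReply(reply: string): string | null {
    try {
        const parsed = JSON.parse(reply.trim()) as { name: string | null };
        return typeof parsed.name === 'string' ? parsed.name : null;
    } catch {
        return null; // model ignored the JSON-only instruction
    }
}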
@@ -36075,7 +36092,7 @@ class TopologyDiscoveryEngine {
  }
  return null;
  }
- /** Get camera snapshot as base64 */
+ /** Get camera snapshot as ImageData */
  async getCameraSnapshot(cameraId) {
  try {
  const camera = systemManager.getDeviceById(cameraId);
@@ -36110,20 +36127,20 @@ class TopologyDiscoveryEngine {
  analysis.error = 'No LLM device available';
  return analysis;
  }
- const imageBase64 = await this.getCameraSnapshot(cameraId);
- if (!imageBase64) {
+ const imageData = await this.getCameraSnapshot(cameraId);
+ if (!imageData) {
  analysis.error = 'Failed to capture camera snapshot';
  return analysis;
  }
  try {
- // Build multimodal message
+ // Build multimodal message with Anthropic-native format
  const result = await llm.getChatCompletion({
  messages: [
  {
  role: 'user',
  content: [
  { type: 'text', text: SCENE_ANALYSIS_PROMPT },
- { type: 'image_url', image_url: { url: imageBase64 } },
+ (0, spatial_reasoning_1.buildImageContent)(imageData),
  ],
  },
  ],
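
A note on the call syntax in the added line: (0, spatial_reasoning_1.buildImageContent)(imageData) is the TypeScript compiler's standard emit for invoking an imported function. The comma expression evaluates to the bare function, so the call proceeds without the namespace object bound as `this`. A tiny self-contained demonstration:

// The comma operator detaches a method from its object before the call.
'use strict';
const obj = {
    whoAmI(this: unknown) { return this; },
};
obj.whoAmI();      // returns obj
(0, obj.whoAmI)(); // returns undefined in strict mode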
@@ -36258,6 +36275,13 @@ class TopologyDiscoveryEngine {
  }
  this.status.camerasAnalyzed = analyses.length;
  this.console.log(`[Discovery] Analyzed ${analyses.length} cameras successfully`);
+ // Handle case where no cameras were successfully analyzed
+ if (analyses.length === 0) {
+ this.console.warn('[Discovery] No cameras were successfully analyzed');
+ this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
+ this.status.lastScanTime = Date.now();
+ return null;
+ }
  // Correlate if we have multiple cameras
  let correlation = null;
  if (analyses.length >= 2) {
@@ -36266,7 +36290,7 @@ class TopologyDiscoveryEngine {
  this.generateSuggestionsFromCorrelation(correlation);
  }
  }
- else {
+ else if (analyses.length === 1) {
  // Single camera - generate suggestions from its analysis
  this.generateSuggestionsFromAnalysis(analyses[0]);
  }
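
Taken together, the last two hunks close a gap: when every per-camera analysis failed, analyses.length was 0, the >= 2 correlation test failed, and the old else branch called generateSuggestionsFromAnalysis(analyses[0]) on an undefined element. A paraphrase of the control flow after the change (condensed; not the literal source, which interleaves other bookkeeping):

// Paraphrased post-analysis branching (logging elided).
if (analyses.length === 0) {
    // new guard: fail fast with a recorded error instead of touching analyses[0]
    this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
    this.status.lastScanTime = Date.now();
    return null;
}
if (analyses.length >= 2) {
    // multi-camera: correlate views, then generateSuggestionsFromCorrelation(...)
} else if (analyses.length === 1) {
    this.generateSuggestionsFromAnalysis(analyses[0]);
}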