@blueharford/scrypted-spatial-awareness 0.5.1 → 0.5.3

package/dist/plugin.zip CHANGED
Binary file
@@ -35094,29 +35094,68 @@ var __importStar = (this && this.__importStar) || (function () {
  Object.defineProperty(exports, "__esModule", ({ value: true }));
  exports.SpatialReasoningEngine = void 0;
  exports.mediaObjectToBase64 = mediaObjectToBase64;
+ exports.buildImageContent = buildImageContent;
  const sdk_1 = __importStar(__webpack_require__(/*! @scrypted/sdk */ "./node_modules/@scrypted/sdk/dist/src/index.js"));
  const topology_1 = __webpack_require__(/*! ../models/topology */ "./src/models/topology.ts");
  const { systemManager, mediaManager } = sdk_1.default;
  /**
- * Convert a MediaObject to a base64 data URL for vision LLM consumption
+ * Convert a MediaObject to base64 image data for vision LLM consumption
  * @param mediaObject - MediaObject from camera.takePicture()
- * @returns Base64 data URL (data:image/jpeg;base64,...) or null if conversion fails
+ * @returns ImageData with raw base64 and media type, or null if conversion fails
  */
  async function mediaObjectToBase64(mediaObject) {
  try {
  // Convert MediaObject to Buffer using mediaManager
  const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, sdk_1.ScryptedMimeTypes.Image);
- // Convert buffer to base64
+ // Convert buffer to base64 (raw, no data URL prefix)
  const base64 = buffer.toString('base64');
  // Determine MIME type - default to JPEG for camera images
- const mimeType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
- return `data:${mimeType};base64,${base64}`;
+ const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+ return { base64, mediaType };
  }
  catch (e) {
  console.warn('Failed to convert MediaObject to base64:', e);
  return null;
  }
  }
+ /**
+ * Build image content block for ChatCompletion API
+ * Supports both OpenAI and Anthropic formats
+ * @param imageData - Image data with base64 and media type
+ * @param provider - The LLM provider type (openai, anthropic, or unknown)
+ */
+ function buildImageContent(imageData, provider = 'unknown') {
+ if (provider === 'openai') {
+ // OpenAI format: uses data URL with image_url wrapper
+ return {
+ type: 'image_url',
+ image_url: {
+ url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+ },
+ };
+ }
+ else if (provider === 'anthropic') {
+ // Anthropic format: uses separate base64 data and media_type
+ return {
+ type: 'image',
+ source: {
+ type: 'base64',
+ media_type: imageData.mediaType,
+ data: imageData.base64,
+ },
+ };
+ }
+ else {
+ // Unknown provider: try OpenAI format as it's more commonly supported
+ // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+ return {
+ type: 'image_url',
+ image_url: {
+ url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+ },
+ };
+ }
+ }
  class SpatialReasoningEngine {
  config;
  console;
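
The hunk above changes mediaObjectToBase64 to return the raw base64 payload and media type separately, and adds buildImageContent to pick the content-block shape each provider expects. A minimal TypeScript restatement of that helper, for readability (the ImageData and LlmProviderType type names are illustrative; the bundle itself uses untyped object literals):

type LlmProviderType = 'openai' | 'anthropic' | 'unknown';

interface ImageData {
    base64: string;    // raw base64 payload, no data URL prefix
    mediaType: string; // e.g. 'image/jpeg'
}

function buildImageContent(imageData: ImageData, provider: LlmProviderType = 'unknown') {
    if (provider === 'anthropic') {
        // Anthropic expects the base64 payload and media type as separate fields.
        return {
            type: 'image',
            source: { type: 'base64', media_type: imageData.mediaType, data: imageData.base64 },
        };
    }
    // OpenAI and OpenAI-compatible providers, plus the unknown fallback,
    // expect a data URL wrapped in an image_url object.
    return {
        type: 'image_url',
        image_url: { url: `data:${imageData.mediaType};base64,${imageData.base64}` },
    };
}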
@@ -35336,6 +35375,7 @@ class SpatialReasoningEngine {
  }
  llmSearched = false;
  llmProvider = null;
+ llmProviderType = 'unknown';
  /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
  async findLlmDevice() {
  if (this.llmDevice)
@@ -35354,30 +35394,39 @@ class SpatialReasoningEngine {
  if (device.interfaces?.includes('ChatCompletion')) {
  const deviceName = device.name?.toLowerCase() || '';
  const pluginId = device.pluginId?.toLowerCase() || '';
- // Identify the provider type for logging
+ // Identify the provider type for logging and image format selection
  let providerType = 'Unknown';
- if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
- providerType = 'Scrypted LLM';
- }
+ let providerTypeEnum = 'unknown';
  if (deviceName.includes('openai') || deviceName.includes('gpt')) {
  providerType = 'OpenAI';
+ providerTypeEnum = 'openai';
  }
  else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
  providerType = 'Anthropic';
+ providerTypeEnum = 'anthropic';
  }
  else if (deviceName.includes('ollama')) {
  providerType = 'Ollama';
+ providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
  }
  else if (deviceName.includes('gemini') || deviceName.includes('google')) {
  providerType = 'Google';
+ providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
  }
  else if (deviceName.includes('llama')) {
  providerType = 'llama.cpp';
+ providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+ }
+ else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+ providerType = 'Scrypted LLM';
+ providerTypeEnum = 'unknown';
  }
  this.llmDevice = device;
  this.llmProvider = `${providerType} (${device.name})`;
+ this.llmProviderType = providerTypeEnum;
  this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
  this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+ this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
  this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
  return this.llmDevice;
  }
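
Condensed, the detection above is a first-match substring lookup on the device name, with the generic @scrypted/llm pluginId check moved to the end of the chain (where it now maps to the 'unknown' image format). A hedged sketch; the detectProviderType helper is hypothetical and not part of the package:

// Hypothetical condensation of the branches above; first match wins.
function detectProviderType(deviceName: string): 'openai' | 'anthropic' | 'unknown' {
    const name = deviceName.toLowerCase();
    if (name.includes('openai') || name.includes('gpt')) return 'openai';
    if (name.includes('anthropic') || name.includes('claude')) return 'anthropic';
    // Ollama, Google/Gemini, and llama.cpp expose OpenAI-compatible chat APIs.
    if (['ollama', 'gemini', 'google', 'llama'].some(s => name.includes(s))) return 'openai';
    // Anything else, including generic @scrypted/llm devices, stays 'unknown'
    // and falls back to the OpenAI-style content block.
    return 'unknown';
}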
@@ -35395,6 +35444,10 @@ class SpatialReasoningEngine {
  getLlmProvider() {
  return this.llmProvider;
  }
+ /** Get the current LLM provider type for image format selection */
+ getLlmProviderType() {
+ return this.llmProviderType;
+ }
  /** Check if LLM is available */
  isLlmAvailable() {
  return this.llmDevice !== null;
@@ -35661,7 +35714,7 @@ class SpatialReasoningEngine {
  return null;
  try {
  // Convert image to base64 for vision LLM
- const imageBase64 = await mediaObjectToBase64(mediaObject);
+ const imageData = await mediaObjectToBase64(mediaObject);
  // Retrieve relevant context for RAG
  const relevantChunks = this.retrieveRelevantContext(fromCamera.deviceId, toCamera.deviceId);
  // Build RAG context
@@ -35670,11 +35723,11 @@ class SpatialReasoningEngine {
  const prompt = this.buildLlmPrompt(tracked, fromCamera, toCamera, transitTime, fromLandmarks, toLandmarks, ragContext);
  // Build message content - use multimodal format if we have an image
  let messageContent;
- if (imageBase64) {
- // Vision-capable multimodal message format (OpenAI compatible)
+ if (imageData) {
+ // Vision-capable multimodal message format (provider-specific)
  messageContent = [
  { type: 'text', text: prompt },
- { type: 'image_url', image_url: { url: imageBase64 } },
+ buildImageContent(imageData, this.llmProviderType),
  ];
  }
  else {
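
With these changes, the vision call sites build the image block through buildImageContent instead of hard-coding the OpenAI image_url shape. An illustrative sketch of the resulting request; the text-only fallback branch is outside this hunk, so treating it as a plain prompt string is an assumption:

// Sketch only: shapes follow the hunks above and the getChatCompletion
// usage shown later in TopologyDiscoveryEngine.analyzeCameraScene.
const imageData = await mediaObjectToBase64(mediaObject);
const messageContent = imageData
    ? [{ type: 'text', text: prompt }, buildImageContent(imageData, this.llmProviderType)]
    : prompt; // assumption: plain string when no image is available

const result = await llm.getChatCompletion({
    messages: [{ role: 'user', content: messageContent }],
});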
@@ -35741,7 +35794,7 @@ Generate ONLY the description, nothing else:`;
  return null;
  try {
  // Convert image to base64 for vision LLM
- const imageBase64 = await mediaObjectToBase64(mediaObject);
+ const imageData = await mediaObjectToBase64(mediaObject);
  const prompt = `Analyze this security camera image. A ${objectClass} was detected.
 
  Looking at the surroundings and environment, identify any notable landmarks or features visible that could help describe this location. Consider:
@@ -35756,11 +35809,11 @@ If you can identify a clear landmark feature, respond with ONLY a JSON object:
  If no clear landmark is identifiable, respond with: {"name": null}`;
  // Build message content - use multimodal format if we have an image
  let messageContent;
- if (imageBase64) {
- // Vision-capable multimodal message format (OpenAI compatible)
+ if (imageData) {
+ // Vision-capable multimodal message format (provider-specific)
  messageContent = [
  { type: 'text', text: prompt },
- { type: 'image_url', image_url: { url: imageBase64 } },
+ buildImageContent(imageData, this.llmProviderType),
  ];
  }
  else {
@@ -36000,6 +36053,7 @@ class TopologyDiscoveryEngine {
  topology = null;
  llmDevice = null;
  llmSearched = false;
+ llmProviderType = 'unknown';
  // Scene analysis cache (camera ID -> analysis)
  sceneCache = new Map();
  // Pending suggestions for user review
@@ -36063,8 +36117,25 @@ class TopologyDiscoveryEngine {
  if (!device)
  continue;
  if (device.interfaces?.includes('ChatCompletion')) {
+ const deviceName = device.name?.toLowerCase() || '';
+ // Detect provider type for image format selection
+ if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+ this.llmProviderType = 'openai';
+ }
+ else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+ this.llmProviderType = 'anthropic';
+ }
+ else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+ deviceName.includes('google') || deviceName.includes('llama')) {
+ // These providers use OpenAI-compatible format
+ this.llmProviderType = 'openai';
+ }
+ else {
+ this.llmProviderType = 'unknown';
+ }
  this.llmDevice = device;
  this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+ this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
  return this.llmDevice;
  }
  }
@@ -36075,7 +36146,7 @@ class TopologyDiscoveryEngine {
  }
  return null;
  }
- /** Get camera snapshot as base64 */
+ /** Get camera snapshot as ImageData */
  async getCameraSnapshot(cameraId) {
  try {
  const camera = systemManager.getDeviceById(cameraId);
@@ -36110,20 +36181,20 @@ class TopologyDiscoveryEngine {
  analysis.error = 'No LLM device available';
  return analysis;
  }
- const imageBase64 = await this.getCameraSnapshot(cameraId);
- if (!imageBase64) {
+ const imageData = await this.getCameraSnapshot(cameraId);
+ if (!imageData) {
  analysis.error = 'Failed to capture camera snapshot';
  return analysis;
  }
  try {
- // Build multimodal message
+ // Build multimodal message with provider-specific image format
  const result = await llm.getChatCompletion({
  messages: [
  {
  role: 'user',
  content: [
  { type: 'text', text: SCENE_ANALYSIS_PROMPT },
- { type: 'image_url', image_url: { url: imageBase64 } },
+ (0, spatial_reasoning_1.buildImageContent)(imageData, this.llmProviderType),
  ],
  },
  ],
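
Note: the (0, spatial_reasoning_1.buildImageContent)(...) form is standard compiled output for calling a function imported from another module; the comma expression detaches the call from the module namespace object so it runs without a `this` binding. It invokes the same buildImageContent exported in the first hunk.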
@@ -36258,6 +36329,13 @@ class TopologyDiscoveryEngine {
  }
  this.status.camerasAnalyzed = analyses.length;
  this.console.log(`[Discovery] Analyzed ${analyses.length} cameras successfully`);
+ // Handle case where no cameras were successfully analyzed
+ if (analyses.length === 0) {
+ this.console.warn('[Discovery] No cameras were successfully analyzed');
+ this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
+ this.status.lastScanTime = Date.now();
+ return null;
+ }
  // Correlate if we have multiple cameras
  let correlation = null;
  if (analyses.length >= 2) {
@@ -36266,7 +36344,7 @@ class TopologyDiscoveryEngine {
  this.generateSuggestionsFromCorrelation(correlation);
  }
  }
- else {
+ else if (analyses.length === 1) {
  // Single camera - generate suggestions from its analysis
  this.generateSuggestionsFromAnalysis(analyses[0]);
  }