@blueharford/scrypted-spatial-awareness 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blueharford/scrypted-spatial-awareness",
-  "version": "0.5.1",
+  "version": "0.5.3",
   "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
   "author": "Joshua Seidel <blueharford>",
   "license": "Apache-2.0",
@@ -69,29 +69,77 @@ interface ChatCompletionDevice extends ScryptedDevice {
   streamChatCompletion?(params: any): AsyncGenerator<any>;
 }
 
+/** Image data for LLM vision APIs */
+export interface ImageData {
+  /** Raw base64 encoded image data (no data URL prefix) */
+  base64: string;
+  /** MIME type (e.g., 'image/jpeg') */
+  mediaType: string;
+}
+
 /**
- * Convert a MediaObject to a base64 data URL for vision LLM consumption
+ * Convert a MediaObject to base64 image data for vision LLM consumption
  * @param mediaObject - MediaObject from camera.takePicture()
- * @returns Base64 data URL (data:image/jpeg;base64,...) or null if conversion fails
+ * @returns ImageData with raw base64 and media type, or null if conversion fails
  */
-export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<string | null> {
+export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<ImageData | null> {
   try {
     // Convert MediaObject to Buffer using mediaManager
     const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
 
-    // Convert buffer to base64
+    // Convert buffer to base64 (raw, no data URL prefix)
    const base64 = buffer.toString('base64');
 
     // Determine MIME type - default to JPEG for camera images
-    const mimeType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+    const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
 
-    return `data:${mimeType};base64,${base64}`;
+    return { base64, mediaType };
   } catch (e) {
     console.warn('Failed to convert MediaObject to base64:', e);
     return null;
   }
 }
 
+/** LLM Provider type for image format selection */
+export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
+
+/**
+ * Build image content block for ChatCompletion API
+ * Supports both OpenAI and Anthropic formats
+ * @param imageData - Image data with base64 and media type
+ * @param provider - The LLM provider type (openai, anthropic, or unknown)
+ */
+export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
+  if (provider === 'openai') {
+    // OpenAI format: uses data URL with image_url wrapper
+    return {
+      type: 'image_url',
+      image_url: {
+        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      },
+    };
+  } else if (provider === 'anthropic') {
+    // Anthropic format: uses separate base64 data and media_type
+    return {
+      type: 'image',
+      source: {
+        type: 'base64',
+        media_type: imageData.mediaType,
+        data: imageData.base64,
+      },
+    };
+  } else {
+    // Unknown provider: try OpenAI format as it's more commonly supported
+    // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+    return {
+      type: 'image_url',
+      image_url: {
+        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      },
+    };
+  }
+}
+
 export class SpatialReasoningEngine {
   private config: SpatialReasoningConfig;
   private console: Console;
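
The two branches of buildImageContent produce structurally different content blocks for the same snapshot. As a minimal sketch (illustration only, not part of the package source; the sample values below are placeholders):

// Illustration only: sample values are placeholders, not real image data.
const sample: ImageData = { base64: 'iVBORw0KGgo...', mediaType: 'image/png' };

buildImageContent(sample, 'openai');
// => { type: 'image_url',
//      image_url: { url: 'data:image/png;base64,iVBORw0KGgo...' } }

buildImageContent(sample, 'anthropic');
// => { type: 'image',
//      source: { type: 'base64', media_type: 'image/png', data: 'iVBORw0KGgo...' } }
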
@@ -336,6 +384,7 @@ export class SpatialReasoningEngine {
 
   private llmSearched: boolean = false;
   private llmProvider: string | null = null;
+  private llmProviderType: LlmProvider = 'unknown';
 
   /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
   private async findLlmDevice(): Promise<ChatCompletionDevice | null> {
@@ -356,27 +405,36 @@
       const deviceName = device.name?.toLowerCase() || '';
       const pluginId = (device as any).pluginId?.toLowerCase() || '';
 
-      // Identify the provider type for logging
+      // Identify the provider type for logging and image format selection
       let providerType = 'Unknown';
-      if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
-        providerType = 'Scrypted LLM';
-      }
+      let providerTypeEnum: LlmProvider = 'unknown';
+
       if (deviceName.includes('openai') || deviceName.includes('gpt')) {
         providerType = 'OpenAI';
+        providerTypeEnum = 'openai';
       } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
         providerType = 'Anthropic';
+        providerTypeEnum = 'anthropic';
       } else if (deviceName.includes('ollama')) {
         providerType = 'Ollama';
+        providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
       } else if (deviceName.includes('gemini') || deviceName.includes('google')) {
         providerType = 'Google';
+        providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
       } else if (deviceName.includes('llama')) {
         providerType = 'llama.cpp';
+        providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+      } else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+        providerType = 'Scrypted LLM';
+        providerTypeEnum = 'unknown';
       }
 
       this.llmDevice = device as unknown as ChatCompletionDevice;
       this.llmProvider = `${providerType} (${device.name})`;
+      this.llmProviderType = providerTypeEnum;
       this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
       this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+      this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
       this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
       return this.llmDevice;
     }
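
Branch order is load-bearing in the chain above: 'ollama' must be tested before 'llama', since any name containing 'ollama' also contains 'llama' and would otherwise be classified as llama.cpp. The same name-matching rules are duplicated in TopologyDiscoveryEngine.findLlmDevice later in this diff; a shared helper along these lines (hypothetical, not in the package) would keep the two engines in sync:

// Hypothetical refactor sketch: detectLlmProvider() does not exist in the
// package; it only restates the matching rules from the hunk above.
function detectLlmProvider(deviceName: string): LlmProvider {
  const name = deviceName.toLowerCase();
  if (name.includes('openai') || name.includes('gpt')) return 'openai';
  if (name.includes('anthropic') || name.includes('claude')) return 'anthropic';
  // Ollama, Gemini/Google, and llama.cpp expose OpenAI-compatible APIs, so
  // they reuse the OpenAI image format; merging 'ollama' and 'llama' into one
  // clause also removes the ordering hazard.
  if (name.includes('ollama') || name.includes('gemini') ||
      name.includes('google') || name.includes('llama')) return 'openai';
  return 'unknown';
}
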
@@ -398,6 +456,11 @@ export class SpatialReasoningEngine {
     return this.llmProvider;
   }
 
+  /** Get the current LLM provider type for image format selection */
+  getLlmProviderType(): LlmProvider {
+    return this.llmProviderType;
+  }
+
   /** Check if LLM is available */
   isLlmAvailable(): boolean {
     return this.llmDevice !== null;
@@ -751,7 +814,7 @@ export class SpatialReasoningEngine {
 
     try {
       // Convert image to base64 for vision LLM
-      const imageBase64 = await mediaObjectToBase64(mediaObject);
+      const imageData = await mediaObjectToBase64(mediaObject);
 
       // Retrieve relevant context for RAG
       const relevantChunks = this.retrieveRelevantContext(
@@ -775,11 +838,11 @@
 
       // Build message content - use multimodal format if we have an image
       let messageContent: any;
-      if (imageBase64) {
-        // Vision-capable multimodal message format (OpenAI compatible)
+      if (imageData) {
+        // Vision-capable multimodal message format (provider-specific)
         messageContent = [
           { type: 'text', text: prompt },
-          { type: 'image_url', image_url: { url: imageBase64 } },
+          buildImageContent(imageData, this.llmProviderType),
         ];
       } else {
         // Fallback to text-only if image conversion failed
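
When conversion succeeds, the engine now sends a two-element content array whose image block matches the detected provider; when it fails, it degrades to a text-only prompt instead of aborting. A sketch of the assembled payload for an 'openai'-type provider (prompt text and base64 abbreviated; values illustrative):

// Illustration of messageContent when imageData is non-null and
// llmProviderType is 'openai'; both strings are placeholders.
const messageContent = [
  { type: 'text', text: 'Analyze this security camera image...' },
  {
    type: 'image_url',
    image_url: { url: 'data:image/jpeg;base64,/9j/4AAQSkZJRg...' },
  },
];
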
@@ -863,7 +926,7 @@ Generate ONLY the description, nothing else:`;
 
     try {
       // Convert image to base64 for vision LLM
-      const imageBase64 = await mediaObjectToBase64(mediaObject);
+      const imageData = await mediaObjectToBase64(mediaObject);
 
       const prompt = `Analyze this security camera image. A ${objectClass} was detected.
 
@@ -880,11 +943,11 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
 
       // Build message content - use multimodal format if we have an image
       let messageContent: any;
-      if (imageBase64) {
-        // Vision-capable multimodal message format (OpenAI compatible)
+      if (imageData) {
+        // Vision-capable multimodal message format (provider-specific)
         messageContent = [
           { type: 'text', text: prompt },
-          { type: 'image_url', image_url: { url: imageBase64 } },
+          buildImageContent(imageData, this.llmProviderType),
         ];
       } else {
         // Fallback to text-only if image conversion failed
@@ -30,7 +30,7 @@ import {
   Landmark,
   findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64 } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
 
 const { systemManager } = sdk;
 
@@ -100,6 +100,7 @@ export class TopologyDiscoveryEngine {
   private topology: CameraTopology | null = null;
   private llmDevice: ChatCompletionDevice | null = null;
   private llmSearched: boolean = false;
+  private llmProviderType: LlmProvider = 'unknown';
 
   // Scene analysis cache (camera ID -> analysis)
   private sceneCache: Map<string, SceneAnalysis> = new Map();
@@ -177,8 +178,24 @@ export class TopologyDiscoveryEngine {
       if (!device) continue;
 
       if (device.interfaces?.includes('ChatCompletion')) {
+        const deviceName = device.name?.toLowerCase() || '';
+
+        // Detect provider type for image format selection
+        if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+          this.llmProviderType = 'openai';
+        } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+          this.llmProviderType = 'anthropic';
+        } else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+                   deviceName.includes('google') || deviceName.includes('llama')) {
+          // These providers use OpenAI-compatible format
+          this.llmProviderType = 'openai';
+        } else {
+          this.llmProviderType = 'unknown';
+        }
+
         this.llmDevice = device as unknown as ChatCompletionDevice;
         this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+        this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
         return this.llmDevice;
       }
     }
@@ -191,8 +208,8 @@ export class TopologyDiscoveryEngine {
     return null;
   }
 
-  /** Get camera snapshot as base64 */
-  private async getCameraSnapshot(cameraId: string): Promise<string | null> {
+  /** Get camera snapshot as ImageData */
+  private async getCameraSnapshot(cameraId: string): Promise<ImageData | null> {
     try {
       const camera = systemManager.getDeviceById<Camera>(cameraId);
       if (!camera?.interfaces?.includes(ScryptedInterface.Camera)) {
@@ -230,21 +247,21 @@ export class TopologyDiscoveryEngine {
       return analysis;
     }
 
-    const imageBase64 = await this.getCameraSnapshot(cameraId);
-    if (!imageBase64) {
+    const imageData = await this.getCameraSnapshot(cameraId);
+    if (!imageData) {
       analysis.error = 'Failed to capture camera snapshot';
       return analysis;
     }
 
     try {
-      // Build multimodal message
+      // Build multimodal message with provider-specific image format
       const result = await llm.getChatCompletion({
         messages: [
           {
             role: 'user',
             content: [
               { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-              { type: 'image_url', image_url: { url: imageBase64 } },
+              buildImageContent(imageData, this.llmProviderType),
             ],
           },
         ],
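
For comparison, the same call against an Anthropic-type provider carries the image as a raw base64 block rather than a data URL. A sketch of the request (SCENE_ANALYSIS_PROMPT is defined elsewhere in the plugin; the base64 payload is abbreviated):

// Illustrative request shape when this.llmProviderType === 'anthropic'.
await llm.getChatCompletion({
  messages: [
    {
      role: 'user',
      content: [
        { type: 'text', text: SCENE_ANALYSIS_PROMPT },
        {
          type: 'image',
          source: {
            type: 'base64',
            media_type: 'image/jpeg',
            data: '/9j/4AAQSkZJRg...', // placeholder
          },
        },
      ],
    },
  ],
});
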
@@ -387,6 +404,14 @@ export class TopologyDiscoveryEngine {
     this.status.camerasAnalyzed = analyses.length;
     this.console.log(`[Discovery] Analyzed ${analyses.length} cameras successfully`);
 
+    // Handle case where no cameras were successfully analyzed
+    if (analyses.length === 0) {
+      this.console.warn('[Discovery] No cameras were successfully analyzed');
+      this.status.lastError = 'No cameras were successfully analyzed - check LLM configuration';
+      this.status.lastScanTime = Date.now();
+      return null;
+    }
+
     // Correlate if we have multiple cameras
     let correlation: TopologyCorrelation | null = null;
     if (analyses.length >= 2) {
@@ -394,7 +419,7 @@ export class TopologyDiscoveryEngine {
       if (correlation) {
         this.generateSuggestionsFromCorrelation(correlation);
       }
-    } else {
+    } else if (analyses.length === 1) {
       // Single camera - generate suggestions from its analysis
       this.generateSuggestionsFromAnalysis(analyses[0]);
     }