@blueharford/scrypted-spatial-awareness 0.5.2 → 0.5.3

package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blueharford/scrypted-spatial-awareness",
-  "version": "0.5.2",
+  "version": "0.5.3",
   "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
   "author": "Joshua Seidel <blueharford>",
   "license": "Apache-2.0",
@@ -100,21 +100,44 @@ export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<Ima
     }
 }
 
+/** LLM Provider type for image format selection */
+export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
+
 /**
  * Build image content block for ChatCompletion API
- * Compatible with both OpenAI and Anthropic formats via @scrypted/llm
+ * Supports both OpenAI and Anthropic formats
+ * @param imageData - Image data with base64 and media type
+ * @param provider - The LLM provider type (openai, anthropic, or unknown)
  */
-export function buildImageContent(imageData: ImageData): any {
-    // Use Anthropic's native format which @scrypted/llm should translate
-    // This format is more explicit about the base64 data
-    return {
-        type: 'image',
-        source: {
-            type: 'base64',
-            media_type: imageData.mediaType,
-            data: imageData.base64,
-        },
-    };
+export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
+    if (provider === 'openai') {
+        // OpenAI format: uses data URL with image_url wrapper
+        return {
+            type: 'image_url',
+            image_url: {
+                url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+            },
+        };
+    } else if (provider === 'anthropic') {
+        // Anthropic format: uses separate base64 data and media_type
+        return {
+            type: 'image',
+            source: {
+                type: 'base64',
+                media_type: imageData.mediaType,
+                data: imageData.base64,
+            },
+        };
+    } else {
+        // Unknown provider: try OpenAI format as it's more commonly supported
+        // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+        return {
+            type: 'image_url',
+            image_url: {
+                url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+            },
+        };
+    }
 }
 
 export class SpatialReasoningEngine {
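
For reference, a minimal sketch (not part of the package) of the two block shapes the reworked buildImageContent emits. The ImageData literal assumes the type carries only the base64 and mediaType fields referenced in the diff above:

```ts
import { buildImageContent, ImageData } from './spatial-reasoning';

// Assumes ImageData requires only the two fields used in the diff.
const imageData: ImageData = { base64: 'iVBORw0KGgo...', mediaType: 'image/png' };

// OpenAI and OpenAI-compatible providers get a data-URL wrapper:
// { type: 'image_url', image_url: { url: 'data:image/png;base64,iVBORw0KGgo...' } }
console.log(buildImageContent(imageData, 'openai'));

// Anthropic gets explicit base64 source fields:
// { type: 'image', source: { type: 'base64', media_type: 'image/png', data: 'iVBORw0KGgo...' } }
console.log(buildImageContent(imageData, 'anthropic'));

// Omitting the provider falls back to the OpenAI shape:
console.log(buildImageContent(imageData));
```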
@@ -361,6 +384,7 @@ export class SpatialReasoningEngine {
 
     private llmSearched: boolean = false;
     private llmProvider: string | null = null;
+    private llmProviderType: LlmProvider = 'unknown';
 
     /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
     private async findLlmDevice(): Promise<ChatCompletionDevice | null> {
@@ -381,27 +405,36 @@ export class SpatialReasoningEngine {
            const deviceName = device.name?.toLowerCase() || '';
            const pluginId = (device as any).pluginId?.toLowerCase() || '';
 
-           // Identify the provider type for logging
+           // Identify the provider type for logging and image format selection
            let providerType = 'Unknown';
-           if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
-               providerType = 'Scrypted LLM';
-           }
+           let providerTypeEnum: LlmProvider = 'unknown';
+
            if (deviceName.includes('openai') || deviceName.includes('gpt')) {
                providerType = 'OpenAI';
+               providerTypeEnum = 'openai';
            } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
                providerType = 'Anthropic';
+               providerTypeEnum = 'anthropic';
            } else if (deviceName.includes('ollama')) {
                providerType = 'Ollama';
+               providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
            } else if (deviceName.includes('gemini') || deviceName.includes('google')) {
                providerType = 'Google';
+               providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
            } else if (deviceName.includes('llama')) {
                providerType = 'llama.cpp';
+               providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+           } else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+               providerType = 'Scrypted LLM';
+               providerTypeEnum = 'unknown';
            }
 
            this.llmDevice = device as unknown as ChatCompletionDevice;
            this.llmProvider = `${providerType} (${device.name})`;
+           this.llmProviderType = providerTypeEnum;
            this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
            this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+           this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
            this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
            return this.llmDevice;
        }
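
The detection above is a substring match on the device name, with the pluginId check demoted to a last-resort fallback. A hypothetical standalone version (not an export of this package) makes the ordering constraint visible: 'ollama' contains the substring 'llama', so the Ollama branch must run before the llama.cpp branch:

```ts
// Hypothetical helper mirroring the branch order above; detectProviderType
// is an illustrative name, not part of the published plugin.
function detectProviderType(deviceName: string): LlmProvider {
    const name = deviceName.toLowerCase();
    if (name.includes('openai') || name.includes('gpt')) return 'openai';
    if (name.includes('anthropic') || name.includes('claude')) return 'anthropic';
    // Checked before 'llama' so that 'ollama' is not claimed by llama.cpp.
    if (name.includes('ollama')) return 'openai';
    if (name.includes('gemini') || name.includes('google')) return 'openai';
    if (name.includes('llama')) return 'openai';
    return 'unknown';
}
```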
@@ -423,6 +456,11 @@ export class SpatialReasoningEngine {
        return this.llmProvider;
    }
 
+   /** Get the current LLM provider type for image format selection */
+   getLlmProviderType(): LlmProvider {
+       return this.llmProviderType;
+   }
+
    /** Check if LLM is available */
    isLlmAvailable(): boolean {
        return this.llmDevice !== null;
@@ -801,10 +839,10 @@ export class SpatialReasoningEngine {
        // Build message content - use multimodal format if we have an image
        let messageContent: any;
        if (imageData) {
-           // Vision-capable multimodal message format (Anthropic native format)
+           // Vision-capable multimodal message format (provider-specific)
            messageContent = [
                { type: 'text', text: prompt },
-               buildImageContent(imageData),
+               buildImageContent(imageData, this.llmProviderType),
            ];
        } else {
            // Fallback to text-only if image conversion failed
@@ -906,10 +944,10 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
        // Build message content - use multimodal format if we have an image
        let messageContent: any;
        if (imageData) {
-           // Vision-capable multimodal message format (Anthropic native format)
+           // Vision-capable multimodal message format (provider-specific)
            messageContent = [
                { type: 'text', text: prompt },
-               buildImageContent(imageData),
+               buildImageContent(imageData, this.llmProviderType),
            ];
        } else {
            // Fallback to text-only if image conversion failed
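
With an OpenAI-type provider detected, the messageContent assembled at both call sites above resolves to a two-element array like the following sketch (prompt text and base64 payload are illustrative placeholders):

```ts
const messageContent = [
    { type: 'text', text: 'Identify the most prominent landmark...' }, // illustrative prompt
    {
        type: 'image_url',
        image_url: { url: 'data:image/jpeg;base64,/9j/4AAQ...' }, // truncated base64
    },
];
```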
@@ -30,7 +30,7 @@ import {
    Landmark,
    findCamera,
} from '../models/topology';
-import { mediaObjectToBase64, buildImageContent, ImageData } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
 
const { systemManager } = sdk;
 
@@ -100,6 +100,7 @@ export class TopologyDiscoveryEngine {
    private topology: CameraTopology | null = null;
    private llmDevice: ChatCompletionDevice | null = null;
    private llmSearched: boolean = false;
+   private llmProviderType: LlmProvider = 'unknown';
 
    // Scene analysis cache (camera ID -> analysis)
    private sceneCache: Map<string, SceneAnalysis> = new Map();
@@ -177,8 +178,24 @@ export class TopologyDiscoveryEngine {
            if (!device) continue;
 
            if (device.interfaces?.includes('ChatCompletion')) {
+               const deviceName = device.name?.toLowerCase() || '';
+
+               // Detect provider type for image format selection
+               if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+                   this.llmProviderType = 'openai';
+               } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+                   this.llmProviderType = 'anthropic';
+               } else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+                   deviceName.includes('google') || deviceName.includes('llama')) {
+                   // These providers use OpenAI-compatible format
+                   this.llmProviderType = 'openai';
+               } else {
+                   this.llmProviderType = 'unknown';
+               }
+
                this.llmDevice = device as unknown as ChatCompletionDevice;
                this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+               this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
                return this.llmDevice;
            }
        }
@@ -237,14 +254,14 @@ export class TopologyDiscoveryEngine {
        }
 
        try {
-           // Build multimodal message with Anthropic-native format
+           // Build multimodal message with provider-specific image format
            const result = await llm.getChatCompletion({
                messages: [
                    {
                        role: 'user',
                        content: [
                            { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-                           buildImageContent(imageData),
+                           buildImageContent(imageData, this.llmProviderType),
                        ],
                    },
                ],
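
Putting it together: had the discovery engine detected an Anthropic device instead, the same call would carry the Anthropic image block. A sketch of the assembled request, with values abbreviated:

```ts
// Sketch only; SCENE_ANALYSIS_PROMPT and the base64 payload are abbreviated.
const result = await llm.getChatCompletion({
    messages: [
        {
            role: 'user',
            content: [
                { type: 'text', text: SCENE_ANALYSIS_PROMPT },
                {
                    type: 'image',
                    source: {
                        type: 'base64',
                        media_type: 'image/jpeg',
                        data: '/9j/4AAQSkZJRg...',
                    },
                },
            ],
        },
    ],
});
```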