@blueharford/scrypted-spatial-awareness 0.5.2 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@blueharford/scrypted-spatial-awareness",
-   "version": "0.5.2",
+   "version": "0.5.4",
    "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
    "author": "Joshua Seidel <blueharford>",
    "license": "Apache-2.0",
@@ -100,21 +100,60 @@ export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<Ima
  }
  }

+ /** LLM Provider type for image format selection */
+ export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
+
  /**
   * Build image content block for ChatCompletion API
-  * Compatible with both OpenAI and Anthropic formats via @scrypted/llm
+  * Supports both OpenAI and Anthropic formats
+  * @param imageData - Image data with base64 and media type
+  * @param provider - The LLM provider type (openai, anthropic, or unknown)
   */
- export function buildImageContent(imageData: ImageData): any {
-   // Use Anthropic's native format which @scrypted/llm should translate
-   // This format is more explicit about the base64 data
-   return {
-     type: 'image',
-     source: {
-       type: 'base64',
-       media_type: imageData.mediaType,
-       data: imageData.base64,
-     },
-   };
+ export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
+   if (provider === 'openai') {
+     // OpenAI format: uses data URL with image_url wrapper
+     // Include detail parameter for compatibility
+     return {
+       type: 'image_url',
+       image_url: {
+         url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+         detail: 'auto',
+       },
+     };
+   } else if (provider === 'anthropic') {
+     // Anthropic format: uses separate base64 data and media_type
+     return {
+       type: 'image',
+       source: {
+         type: 'base64',
+         media_type: imageData.mediaType,
+         data: imageData.base64,
+       },
+     };
+   } else {
+     // Unknown provider: try Anthropic format first as it's more explicit
+     // Some plugins may translate this to OpenAI format internally
+     return {
+       type: 'image',
+       source: {
+         type: 'base64',
+         media_type: imageData.mediaType,
+         data: imageData.base64,
+       },
+     };
+   }
+ }
+
+ /** Check if an error indicates vision/multimodal content is not supported */
+ export function isVisionNotSupportedError(error: any): boolean {
+   const errorStr = String(error);
+   return (
+     errorStr.includes('content.str') ||
+     errorStr.includes('should be a valid string') ||
+     errorStr.includes('Invalid content type') ||
+     errorStr.includes('does not support vision') ||
+     errorStr.includes('image_url') && errorStr.includes('not supported')
+   );
  }

  export class SpatialReasoningEngine {
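
For reference, the two content-block shapes produced by the new buildImageContent look like the following. This is a minimal sketch with placeholder values, assuming ImageData carries only the base64 and mediaType fields used above and that the module resolves at the './spatial-reasoning' path used elsewhere in this diff; it is not code from the package.

// Illustrative only: placeholder image data, not taken from the package.
import { buildImageContent, ImageData } from './spatial-reasoning';

const imageData: ImageData = { mediaType: 'image/jpeg', base64: '/9j/4AAQSkZJRg...' };

// OpenAI-compatible providers (OpenAI, Ollama, Google, llama.cpp) receive a data-URL block:
// { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,...', detail: 'auto' } }
const openaiBlock = buildImageContent(imageData, 'openai');

// Anthropic (and the 'unknown' fallback) receive a base64 source block:
// { type: 'image', source: { type: 'base64', media_type: 'image/jpeg', data: '...' } }
const anthropicBlock = buildImageContent(imageData, 'anthropic');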
@@ -361,6 +400,7 @@ export class SpatialReasoningEngine {

  private llmSearched: boolean = false;
  private llmProvider: string | null = null;
+ private llmProviderType: LlmProvider = 'unknown';

  /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
  private async findLlmDevice(): Promise<ChatCompletionDevice | null> {
@@ -381,27 +421,36 @@ export class SpatialReasoningEngine {
  const deviceName = device.name?.toLowerCase() || '';
  const pluginId = (device as any).pluginId?.toLowerCase() || '';

- // Identify the provider type for logging
+ // Identify the provider type for logging and image format selection
  let providerType = 'Unknown';
- if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
-   providerType = 'Scrypted LLM';
- }
+ let providerTypeEnum: LlmProvider = 'unknown';
+
  if (deviceName.includes('openai') || deviceName.includes('gpt')) {
    providerType = 'OpenAI';
+   providerTypeEnum = 'openai';
  } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
    providerType = 'Anthropic';
+   providerTypeEnum = 'anthropic';
  } else if (deviceName.includes('ollama')) {
    providerType = 'Ollama';
+   providerTypeEnum = 'openai'; // Ollama uses OpenAI-compatible format
  } else if (deviceName.includes('gemini') || deviceName.includes('google')) {
    providerType = 'Google';
+   providerTypeEnum = 'openai'; // Google uses OpenAI-compatible format
  } else if (deviceName.includes('llama')) {
    providerType = 'llama.cpp';
+   providerTypeEnum = 'openai'; // llama.cpp uses OpenAI-compatible format
+ } else if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+   providerType = 'Scrypted LLM';
+   providerTypeEnum = 'unknown';
  }

  this.llmDevice = device as unknown as ChatCompletionDevice;
  this.llmProvider = `${providerType} (${device.name})`;
+ this.llmProviderType = providerTypeEnum;
  this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
  this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+ this.console.log(`[LLM] Image format: ${providerTypeEnum}`);
  this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
  return this.llmDevice;
  }
@@ -423,6 +472,11 @@ export class SpatialReasoningEngine {
    return this.llmProvider;
  }

+ /** Get the current LLM provider type for image format selection */
+ getLlmProviderType(): LlmProvider {
+   return this.llmProviderType;
+ }
+
  /** Check if LLM is available */
  isLlmAvailable(): boolean {
    return this.llmDevice !== null;
@@ -801,10 +855,10 @@ export class SpatialReasoningEngine {
  // Build message content - use multimodal format if we have an image
  let messageContent: any;
  if (imageData) {
-   // Vision-capable multimodal message format (Anthropic native format)
+   // Vision-capable multimodal message format (provider-specific)
    messageContent = [
      { type: 'text', text: prompt },
-     buildImageContent(imageData),
+     buildImageContent(imageData, this.llmProviderType),
    ];
  } else {
    // Fallback to text-only if image conversion failed
@@ -906,10 +960,10 @@ If no clear landmark is identifiable, respond with: {"name": null}`;
  // Build message content - use multimodal format if we have an image
  let messageContent: any;
  if (imageData) {
-   // Vision-capable multimodal message format (Anthropic native format)
+   // Vision-capable multimodal message format (provider-specific)
    messageContent = [
      { type: 'text', text: prompt },
-     buildImageContent(imageData),
+     buildImageContent(imageData, this.llmProviderType),
    ];
  } else {
    // Fallback to text-only if image conversion failed
@@ -30,7 +30,7 @@ import {
    Landmark,
    findCamera,
  } from '../models/topology';
- import { mediaObjectToBase64, buildImageContent, ImageData } from './spatial-reasoning';
+ import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider, isVisionNotSupportedError } from './spatial-reasoning';

  const { systemManager } = sdk;

@@ -100,6 +100,7 @@ export class TopologyDiscoveryEngine {
  private topology: CameraTopology | null = null;
  private llmDevice: ChatCompletionDevice | null = null;
  private llmSearched: boolean = false;
+ private llmProviderType: LlmProvider = 'unknown';

  // Scene analysis cache (camera ID -> analysis)
  private sceneCache: Map<string, SceneAnalysis> = new Map();
@@ -177,8 +178,24 @@ export class TopologyDiscoveryEngine {
  if (!device) continue;

  if (device.interfaces?.includes('ChatCompletion')) {
+   const deviceName = device.name?.toLowerCase() || '';
+
+   // Detect provider type for image format selection
+   if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+     this.llmProviderType = 'openai';
+   } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+     this.llmProviderType = 'anthropic';
+   } else if (deviceName.includes('ollama') || deviceName.includes('gemini') ||
+              deviceName.includes('google') || deviceName.includes('llama')) {
+     // These providers use OpenAI-compatible format
+     this.llmProviderType = 'openai';
+   } else {
+     this.llmProviderType = 'unknown';
+   }
+
    this.llmDevice = device as unknown as ChatCompletionDevice;
    this.console.log(`[Discovery] Connected to LLM: ${device.name}`);
+   this.console.log(`[Discovery] Image format: ${this.llmProviderType}`);
    return this.llmDevice;
  }
  }
@@ -236,77 +253,125 @@ export class TopologyDiscoveryEngine {
    return analysis;
  }

- try {
-   // Build multimodal message with Anthropic-native format
-   const result = await llm.getChatCompletion({
-     messages: [
-       {
-         role: 'user',
-         content: [
-           { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-           buildImageContent(imageData),
-         ],
-       },
-     ],
-     max_tokens: 500,
-     temperature: 0.3,
-   });
-
-   const content = result?.choices?.[0]?.message?.content;
-   if (content && typeof content === 'string') {
-     try {
-       // Extract JSON from response (handle markdown code blocks)
-       let jsonStr = content.trim();
-       if (jsonStr.startsWith('```')) {
-         jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
-       }
-
-       const parsed = JSON.parse(jsonStr);
-
-       // Map parsed data to our types
-       if (Array.isArray(parsed.landmarks)) {
-         analysis.landmarks = parsed.landmarks.map((l: any) => ({
-           name: l.name || 'Unknown',
-           type: this.mapLandmarkType(l.type),
-           confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
-           description: l.description || '',
-           boundingBox: l.boundingBox,
-         }));
-       }
+ // Try with detected provider format first, then fallback to alternate format
+ const formatsToTry: LlmProvider[] = [this.llmProviderType];
+
+ // Add fallback format
+ if (this.llmProviderType === 'openai') {
+   formatsToTry.push('anthropic');
+ } else if (this.llmProviderType === 'anthropic') {
+   formatsToTry.push('openai');
+ } else {
+   // Unknown - try both
+   formatsToTry.push('openai');
+ }

-       if (Array.isArray(parsed.zones)) {
-         analysis.zones = parsed.zones.map((z: any) => ({
-           name: z.name || 'Unknown',
-           type: this.mapZoneType(z.type),
-           coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
-           description: z.description || '',
-           boundingBox: z.boundingBox,
-         }));
+ let lastError: any = null;
+
+ for (const formatType of formatsToTry) {
+   try {
+     this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+
+     // Build multimodal message with provider-specific image format
+     const result = await llm.getChatCompletion({
+       messages: [
+         {
+           role: 'user',
+           content: [
+             { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+             buildImageContent(imageData, formatType),
+           ],
+         },
+       ],
+       max_tokens: 500,
+       temperature: 0.3,
+     });
+
+     const content = result?.choices?.[0]?.message?.content;
+     if (content && typeof content === 'string') {
+       try {
+         // Extract JSON from response (handle markdown code blocks)
+         let jsonStr = content.trim();
+         if (jsonStr.startsWith('```')) {
+           jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+         }
+
+         const parsed = JSON.parse(jsonStr);
+
+         // Map parsed data to our types
+         if (Array.isArray(parsed.landmarks)) {
+           analysis.landmarks = parsed.landmarks.map((l: any) => ({
+             name: l.name || 'Unknown',
+             type: this.mapLandmarkType(l.type),
+             confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+             description: l.description || '',
+             boundingBox: l.boundingBox,
+           }));
+         }
+
+         if (Array.isArray(parsed.zones)) {
+           analysis.zones = parsed.zones.map((z: any) => ({
+             name: z.name || 'Unknown',
+             type: this.mapZoneType(z.type),
+             coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+             description: z.description || '',
+             boundingBox: z.boundingBox,
+           }));
+         }
+
+         if (parsed.edges && typeof parsed.edges === 'object') {
+           analysis.edges = {
+             top: parsed.edges.top || '',
+             left: parsed.edges.left || '',
+             right: parsed.edges.right || '',
+             bottom: parsed.edges.bottom || '',
+           };
+         }
+
+         if (parsed.orientation) {
+           analysis.orientation = this.mapOrientation(parsed.orientation);
+         }
+
+         analysis.isValid = true;
+         this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+
+         // Update the preferred format for future requests
+         if (formatType !== this.llmProviderType) {
+           this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+           this.llmProviderType = formatType;
+         }
+
+         // Success - exit the retry loop
+         return analysis;
+       } catch (parseError) {
+         this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+         analysis.error = 'Failed to parse LLM response';
+         return analysis;
        }
+     }
+   } catch (e) {
+     lastError = e;

-       if (parsed.edges && typeof parsed.edges === 'object') {
-         analysis.edges = {
-           top: parsed.edges.top || '',
-           left: parsed.edges.left || '',
-           right: parsed.edges.right || '',
-           bottom: parsed.edges.bottom || '',
-         };
-       }
+     // Check if this is a vision/multimodal format error
+     if (isVisionNotSupportedError(e)) {
+       this.console.warn(`[Discovery] ${formatType} format not supported, trying fallback...`);
+       continue; // Try next format
+     }

-       if (parsed.orientation) {
-         analysis.orientation = this.mapOrientation(parsed.orientation);
-       }
+     // Not a format error - don't retry
+     this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+     break;
+   }
+ }

-       analysis.isValid = true;
-       this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
-     } catch (parseError) {
-       this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
-       analysis.error = 'Failed to parse LLM response';
-     }
+ // All formats failed
+ if (lastError) {
+   const errorStr = String(lastError);
+   if (isVisionNotSupportedError(lastError)) {
+     analysis.error = 'Vision/image analysis not supported by configured LLM. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured.';
+   } else {
+     analysis.error = `Analysis failed: ${errorStr}`;
    }
- } catch (e) {
-   this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
-   analysis.error = `Analysis failed: ${e}`;
  }

  // Cache the analysis
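
Taken together, the new control flow in this hunk reduces to a small retry loop: try the detected image format first, and only retry with the alternate format when the failure looks like a vision/format rejection per isVisionNotSupportedError. The following is a condensed, hedged sketch of that pattern, not the package's code; the function name, the llm parameter type, and the return shape are illustrative, and the import path mirrors the one used in this diff.

import { buildImageContent, isVisionNotSupportedError, ImageData, LlmProvider } from './spatial-reasoning';

// Condensed illustration of the format-fallback strategy added above.
async function getCompletionWithFormatFallback(
  llm: { getChatCompletion(request: any): Promise<any> },
  prompt: string,
  imageData: ImageData,
  preferred: LlmProvider,
): Promise<{ content?: string; usedFormat?: LlmProvider; error?: unknown }> {
  // Same ordering as the diff: detected format first, then the alternate
  // ('unknown' falls back to 'openai', mirroring the formatsToTry logic).
  const formatsToTry: LlmProvider[] = [preferred, preferred === 'openai' ? 'anthropic' : 'openai'];

  let lastError: unknown = null;
  for (const format of formatsToTry) {
    try {
      const result = await llm.getChatCompletion({
        messages: [
          {
            role: 'user',
            content: [
              { type: 'text', text: prompt },
              buildImageContent(imageData, format),
            ],
          },
        ],
        max_tokens: 500,
        temperature: 0.3,
      });
      const content = result?.choices?.[0]?.message?.content;
      if (typeof content === 'string')
        return { content, usedFormat: format };
    } catch (e) {
      lastError = e;
      if (isVisionNotSupportedError(e))
        continue; // image format rejected - try the alternate shape
      break; // unrelated failure - do not retry
    }
  }
  return { error: lastError };
}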