@blueharford/scrypted-spatial-awareness 0.5.3 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blueharford/scrypted-spatial-awareness",
-  "version": "0.5.3",
+  "version": "0.5.4",
   "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
   "author": "Joshua Seidel <blueharford>",
   "license": "Apache-2.0",
@@ -112,10 +112,12 @@ export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
 export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
   if (provider === 'openai') {
     // OpenAI format: uses data URL with image_url wrapper
+    // Include detail parameter for compatibility
     return {
       type: 'image_url',
       image_url: {
         url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+        detail: 'auto',
       },
     };
   } else if (provider === 'anthropic') {
@@ -129,17 +131,31 @@ export function buildImageContent(imageData: ImageData, provider: LlmProvider =
       },
     };
   } else {
-    // Unknown provider: try OpenAI format as it's more commonly supported
-    // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+    // Unknown provider: try Anthropic format first as it's more explicit
+    // Some plugins may translate this to OpenAI format internally
     return {
-      type: 'image_url',
-      image_url: {
-        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      type: 'image',
+      source: {
+        type: 'base64',
+        media_type: imageData.mediaType,
+        data: imageData.base64,
       },
     };
   }
 }
 
+/** Check if an error indicates vision/multimodal content is not supported */
+export function isVisionNotSupportedError(error: any): boolean {
+  const errorStr = String(error);
+  return (
+    errorStr.includes('content.str') ||
+    errorStr.includes('should be a valid string') ||
+    errorStr.includes('Invalid content type') ||
+    errorStr.includes('does not support vision') ||
+    errorStr.includes('image_url') && errorStr.includes('not supported')
+  );
+}
+
 export class SpatialReasoningEngine {
   private config: SpatialReasoningConfig;
   private console: Console;
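For reference, here is a minimal sketch (not part of the package diff itself) of the two content shapes buildImageContent now emits: the OpenAI image_url wrapper, which gains detail: 'auto' in 0.5.4, and the Anthropic base64 source block that unknown providers now receive first.

```ts
// Illustrative only: the two content shapes buildImageContent returns in 0.5.4.
// The ImageData fields (mediaType, base64) follow the package's own type.
const imageData = { mediaType: 'image/jpeg', base64: '/9j/4AAQ...' };

// provider === 'openai': data URL wrapped in image_url, now with detail for compatibility
const openAiContent = {
  type: 'image_url',
  image_url: {
    url: `data:${imageData.mediaType};base64,${imageData.base64}`,
    detail: 'auto',
  },
};

// provider === 'anthropic', and (new in 0.5.4) the 'unknown' fallback
const anthropicContent = {
  type: 'image',
  source: {
    type: 'base64',
    media_type: imageData.mediaType,
    data: imageData.base64,
  },
};
```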
@@ -30,7 +30,7 @@ import {
   Landmark,
   findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider, isVisionNotSupportedError } from './spatial-reasoning';
 
 const { systemManager } = sdk;
 
@@ -253,77 +253,125 @@ export class TopologyDiscoveryEngine {
       return analysis;
     }
 
-    try {
-      // Build multimodal message with provider-specific image format
-      const result = await llm.getChatCompletion({
-        messages: [
-          {
-            role: 'user',
-            content: [
-              { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-              buildImageContent(imageData, this.llmProviderType),
-            ],
-          },
-        ],
-        max_tokens: 500,
-        temperature: 0.3,
-      });
-
-      const content = result?.choices?.[0]?.message?.content;
-      if (content && typeof content === 'string') {
-        try {
-          // Extract JSON from response (handle markdown code blocks)
-          let jsonStr = content.trim();
-          if (jsonStr.startsWith('```')) {
-            jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
-          }
-
-          const parsed = JSON.parse(jsonStr);
-
-          // Map parsed data to our types
-          if (Array.isArray(parsed.landmarks)) {
-            analysis.landmarks = parsed.landmarks.map((l: any) => ({
-              name: l.name || 'Unknown',
-              type: this.mapLandmarkType(l.type),
-              confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
-              description: l.description || '',
-              boundingBox: l.boundingBox,
-            }));
-          }
+    // Try with detected provider format first, then fallback to alternate format
+    const formatsToTry: LlmProvider[] = [this.llmProviderType];
+
+    // Add fallback format
+    if (this.llmProviderType === 'openai') {
+      formatsToTry.push('anthropic');
+    } else if (this.llmProviderType === 'anthropic') {
+      formatsToTry.push('openai');
+    } else {
+      // Unknown - try both
+      formatsToTry.push('openai');
+    }
 
-          if (Array.isArray(parsed.zones)) {
-            analysis.zones = parsed.zones.map((z: any) => ({
-              name: z.name || 'Unknown',
-              type: this.mapZoneType(z.type),
-              coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
-              description: z.description || '',
-              boundingBox: z.boundingBox,
-            }));
+    let lastError: any = null;
+
+    for (const formatType of formatsToTry) {
+      try {
+        this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+
+        // Build multimodal message with provider-specific image format
+        const result = await llm.getChatCompletion({
+          messages: [
+            {
+              role: 'user',
+              content: [
+                { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+                buildImageContent(imageData, formatType),
+              ],
+            },
+          ],
+          max_tokens: 500,
+          temperature: 0.3,
+        });
+
+        const content = result?.choices?.[0]?.message?.content;
+        if (content && typeof content === 'string') {
+          try {
+            // Extract JSON from response (handle markdown code blocks)
+            let jsonStr = content.trim();
+            if (jsonStr.startsWith('```')) {
+              jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+            }
+
+            const parsed = JSON.parse(jsonStr);
+
+            // Map parsed data to our types
+            if (Array.isArray(parsed.landmarks)) {
+              analysis.landmarks = parsed.landmarks.map((l: any) => ({
+                name: l.name || 'Unknown',
+                type: this.mapLandmarkType(l.type),
+                confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+                description: l.description || '',
+                boundingBox: l.boundingBox,
+              }));
+            }
+
+            if (Array.isArray(parsed.zones)) {
+              analysis.zones = parsed.zones.map((z: any) => ({
+                name: z.name || 'Unknown',
+                type: this.mapZoneType(z.type),
+                coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+                description: z.description || '',
+                boundingBox: z.boundingBox,
+              }));
+            }
+
+            if (parsed.edges && typeof parsed.edges === 'object') {
+              analysis.edges = {
+                top: parsed.edges.top || '',
+                left: parsed.edges.left || '',
+                right: parsed.edges.right || '',
+                bottom: parsed.edges.bottom || '',
+              };
+            }
+
+            if (parsed.orientation) {
+              analysis.orientation = this.mapOrientation(parsed.orientation);
+            }
+
+            analysis.isValid = true;
+            this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+
+            // Update the preferred format for future requests
+            if (formatType !== this.llmProviderType) {
+              this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+              this.llmProviderType = formatType;
+            }
+
+            // Success - exit the retry loop
+            return analysis;
+          } catch (parseError) {
+            this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+            analysis.error = 'Failed to parse LLM response';
+            return analysis;
           }
+        }
+      } catch (e) {
+        lastError = e;
 
-          if (parsed.edges && typeof parsed.edges === 'object') {
-            analysis.edges = {
-              top: parsed.edges.top || '',
-              left: parsed.edges.left || '',
-              right: parsed.edges.right || '',
-              bottom: parsed.edges.bottom || '',
-            };
-          }
+        // Check if this is a vision/multimodal format error
+        if (isVisionNotSupportedError(e)) {
+          this.console.warn(`[Discovery] ${formatType} format not supported, trying fallback...`);
+          continue; // Try next format
+        }
 
-          if (parsed.orientation) {
-            analysis.orientation = this.mapOrientation(parsed.orientation);
-          }
+        // Not a format error - don't retry
+        this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+        break;
+      }
+    }
 
-          analysis.isValid = true;
-          this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
-        } catch (parseError) {
-          this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
-          analysis.error = 'Failed to parse LLM response';
-        }
+    // All formats failed
+    if (lastError) {
+      const errorStr = String(lastError);
+      if (isVisionNotSupportedError(lastError)) {
+        analysis.error = 'Vision/image analysis not supported by configured LLM. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured.';
+      } else {
+        analysis.error = `Analysis failed: ${errorStr}`;
       }
-    } catch (e) {
-      this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
-      analysis.error = `Analysis failed: ${e}`;
     }
 
     // Cache the analysis
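Condensed, the retry logic added in this release follows the pattern below. This is a sketch, not the plugin's code: sendSceneAnalysis is a hypothetical stand-in for the getChatCompletion call with the scene prompt and image attached, while LlmProvider and isVisionNotSupportedError are the package's own exports.

```ts
// Sketch of the 0.5.4 format-fallback pattern (illustrative; `sendSceneAnalysis`
// is a hypothetical wrapper around llm.getChatCompletion with prompt + image).
import { LlmProvider, isVisionNotSupportedError } from './spatial-reasoning';

async function analyzeWithFallback(
  preferred: LlmProvider,
  sendSceneAnalysis: (format: LlmProvider) => Promise<string>,
): Promise<string> {
  // Preferred format first; OpenAI falls back to Anthropic, everything else to OpenAI.
  const formatsToTry: LlmProvider[] = [preferred];
  formatsToTry.push(preferred === 'openai' ? 'anthropic' : 'openai');

  let lastError: unknown;
  for (const format of formatsToTry) {
    try {
      // Success: the caller keeps using this format for future requests.
      return await sendSceneAnalysis(format);
    } catch (e) {
      lastError = e;
      if (isVisionNotSupportedError(e)) continue; // wrong image shape: try the next format
      break; // any other error is not retried
    }
  }
  throw lastError;
}
```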