@blueharford/scrypted-spatial-awareness 0.5.3 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blueharford/scrypted-spatial-awareness",
-  "version": "0.5.3",
+  "version": "0.5.5",
   "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
   "author": "Joshua Seidel <blueharford>",
   "license": "Apache-2.0",
@@ -87,12 +87,25 @@ export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<Ima
     // Convert MediaObject to Buffer using mediaManager
     const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
 
+    if (!buffer || buffer.length === 0) {
+      console.warn('Failed to convert MediaObject: empty buffer');
+      return null;
+    }
+
     // Convert buffer to base64 (raw, no data URL prefix)
     const base64 = buffer.toString('base64');
 
+    // Validate base64 - check it's not empty and looks valid
+    if (!base64 || base64.length < 100) {
+      console.warn(`Invalid base64: length=${base64?.length || 0}`);
+      return null;
+    }
+
     // Determine MIME type - default to JPEG for camera images
     const mediaType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
 
+    console.log(`[Image] Converted to base64: ${base64.length} chars, type=${mediaType}`);
+
     return { base64, mediaType };
   } catch (e) {
     console.warn('Failed to convert MediaObject to base64:', e);
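In 0.5.5, mediaObjectToBase64 returns null when the converted buffer is empty or the resulting base64 string is implausibly short, so callers should check for null before building a vision message. A minimal caller sketch, assuming the truncated signature above resolves to Promise<ImageData | null>; the helper name toImageBlock is illustrative, not part of the plugin:

    import { MediaObject } from '@scrypted/sdk';
    import { mediaObjectToBase64, buildImageContent } from './spatial-reasoning';

    // Illustrative helper: convert a snapshot and skip the LLM call if validation failed.
    async function toImageBlock(snapshot: MediaObject): Promise<any | undefined> {
      const imageData = await mediaObjectToBase64(snapshot);
      if (!imageData) {
        // Empty buffer or too-short base64: mediaObjectToBase64 already logged a warning.
        return undefined;
      }
      // The provider argument mirrors the LlmProvider union introduced below.
      return buildImageContent(imageData, 'scrypted');
    }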
@@ -101,13 +114,13 @@ export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<Ima
 }
 
 /** LLM Provider type for image format selection */
-export type LlmProvider = 'openai' | 'anthropic' | 'unknown';
+export type LlmProvider = 'openai' | 'anthropic' | 'scrypted' | 'unknown';
 
 /**
  * Build image content block for ChatCompletion API
- * Supports both OpenAI and Anthropic formats
+ * Supports OpenAI, Anthropic, and @scrypted/llm formats
  * @param imageData - Image data with base64 and media type
- * @param provider - The LLM provider type (openai, anthropic, or unknown)
+ * @param provider - The LLM provider type
  */
 export function buildImageContent(imageData: ImageData, provider: LlmProvider = 'unknown'): any {
   if (provider === 'openai') {
@@ -116,10 +129,11 @@ export function buildImageContent(imageData: ImageData, provider: LlmProvider =
       type: 'image_url',
       image_url: {
         url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+        detail: 'auto',
       },
     };
   } else if (provider === 'anthropic') {
-    // Anthropic format: uses separate base64 data and media_type
+    // Anthropic official format: uses 'data' key
    return {
      type: 'image',
      source: {
@@ -128,18 +142,46 @@ export function buildImageContent(imageData: ImageData, provider: LlmProvider =
         data: imageData.base64,
       },
     };
+  } else if (provider === 'scrypted') {
+    // @scrypted/llm format: uses 'base64' key (per error path .image.source.base64)
+    return {
+      type: 'image',
+      source: {
+        type: 'base64',
+        media_type: imageData.mediaType,
+        base64: imageData.base64,
+      },
+    };
   } else {
-    // Unknown provider: try OpenAI format as it's more commonly supported
-    // Most LLM wrappers (including @scrypted/llm) understand the OpenAI format
+    // Unknown provider: try @scrypted/llm format first
     return {
-      type: 'image_url',
-      image_url: {
-        url: `data:${imageData.mediaType};base64,${imageData.base64}`,
+      type: 'image',
+      source: {
+        type: 'base64',
+        media_type: imageData.mediaType,
+        base64: imageData.base64,
       },
     };
   }
 }
 
+/** Check if an error indicates vision/multimodal content format issue (should try alternate format) */
+export function isVisionFormatError(error: any): boolean {
+  const errorStr = String(error);
+  return (
+    errorStr.includes('content.str') ||
+    errorStr.includes('should be a valid string') ||
+    errorStr.includes('Invalid content type') ||
+    errorStr.includes('does not support vision') ||
+    errorStr.includes('invalid base64') ||
+    errorStr.includes('Invalid base64') ||
+    errorStr.includes('.image.source') ||
+    errorStr.includes('.image_url') ||
+    (errorStr.includes('image_url') && errorStr.includes('not supported')) ||
+    (errorStr.includes('400') && errorStr.includes('content'))
+  );
+}
+
 export class SpatialReasoningEngine {
   private config: SpatialReasoningConfig;
   private console: Console;
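For reference, the three content-block shapes the updated buildImageContent emits, written out with illustrative values (an actual base64 payload replaces <BASE64>):

    // OpenAI-style block ('openai'):
    const openaiBlock = {
      type: 'image_url',
      image_url: { url: 'data:image/jpeg;base64,<BASE64>', detail: 'auto' },
    };

    // Anthropic-style block ('anthropic'): image data under the 'data' key.
    const anthropicBlock = {
      type: 'image',
      source: { type: 'base64', media_type: 'image/jpeg', data: '<BASE64>' },
    };

    // @scrypted/llm-style block ('scrypted', also used for 'unknown'): data under 'base64'.
    const scryptedBlock = {
      type: 'image',
      source: { type: 'base64', media_type: 'image/jpeg', base64: '<BASE64>' },
    };

Note that isVisionFormatError is a substring heuristic over the stringified error, so providers that report format problems with different wording will surface as non-retryable failures.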
@@ -30,7 +30,7 @@ import {
   Landmark,
   findCamera,
 } from '../models/topology';
-import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider } from './spatial-reasoning';
+import { mediaObjectToBase64, buildImageContent, ImageData, LlmProvider, isVisionFormatError } from './spatial-reasoning';
 
 const { systemManager } = sdk;
 
@@ -253,77 +253,131 @@ export class TopologyDiscoveryEngine {
       return analysis;
     }
 
-    try {
-      // Build multimodal message with provider-specific image format
-      const result = await llm.getChatCompletion({
-        messages: [
-          {
-            role: 'user',
-            content: [
-              { type: 'text', text: SCENE_ANALYSIS_PROMPT },
-              buildImageContent(imageData, this.llmProviderType),
-            ],
-          },
-        ],
-        max_tokens: 500,
-        temperature: 0.3,
-      });
-
-      const content = result?.choices?.[0]?.message?.content;
-      if (content && typeof content === 'string') {
-        try {
-          // Extract JSON from response (handle markdown code blocks)
-          let jsonStr = content.trim();
-          if (jsonStr.startsWith('```')) {
-            jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
-          }
-
-          const parsed = JSON.parse(jsonStr);
-
-          // Map parsed data to our types
-          if (Array.isArray(parsed.landmarks)) {
-            analysis.landmarks = parsed.landmarks.map((l: any) => ({
-              name: l.name || 'Unknown',
-              type: this.mapLandmarkType(l.type),
-              confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
-              description: l.description || '',
-              boundingBox: l.boundingBox,
-            }));
-          }
+    // Try with detected provider format first, then fallback to alternates
+    // The order matters: try the most likely formats first
+    const formatsToTry: LlmProvider[] = [];
+
+    // Start with detected format
+    formatsToTry.push(this.llmProviderType);
+
+    // Add fallbacks based on detected provider
+    if (this.llmProviderType === 'openai') {
+      formatsToTry.push('scrypted', 'anthropic');
+    } else if (this.llmProviderType === 'anthropic') {
+      formatsToTry.push('scrypted', 'openai');
+    } else if (this.llmProviderType === 'scrypted') {
+      formatsToTry.push('anthropic', 'openai');
+    } else {
+      // Unknown - try all formats
+      formatsToTry.push('scrypted', 'anthropic', 'openai');
+    }
 
-          if (Array.isArray(parsed.zones)) {
-            analysis.zones = parsed.zones.map((z: any) => ({
-              name: z.name || 'Unknown',
-              type: this.mapZoneType(z.type),
-              coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
-              description: z.description || '',
-              boundingBox: z.boundingBox,
-            }));
+    let lastError: any = null;
+
+    for (const formatType of formatsToTry) {
+      try {
+        this.console.log(`[Discovery] Trying ${formatType} image format for ${cameraName}...`);
+
+        // Build multimodal message with provider-specific image format
+        const result = await llm.getChatCompletion({
+          messages: [
+            {
+              role: 'user',
+              content: [
+                { type: 'text', text: SCENE_ANALYSIS_PROMPT },
+                buildImageContent(imageData, formatType),
+              ],
+            },
+          ],
+          max_tokens: 500,
+          temperature: 0.3,
+        });
+
+        const content = result?.choices?.[0]?.message?.content;
+        if (content && typeof content === 'string') {
+          try {
+            // Extract JSON from response (handle markdown code blocks)
+            let jsonStr = content.trim();
+            if (jsonStr.startsWith('```')) {
+              jsonStr = jsonStr.replace(/```json?\n?/g, '').replace(/```$/g, '').trim();
+            }
+
+            const parsed = JSON.parse(jsonStr);
+
+            // Map parsed data to our types
+            if (Array.isArray(parsed.landmarks)) {
+              analysis.landmarks = parsed.landmarks.map((l: any) => ({
+                name: l.name || 'Unknown',
+                type: this.mapLandmarkType(l.type),
+                confidence: typeof l.confidence === 'number' ? l.confidence : 0.7,
+                description: l.description || '',
+                boundingBox: l.boundingBox,
+              }));
+            }
+
+            if (Array.isArray(parsed.zones)) {
+              analysis.zones = parsed.zones.map((z: any) => ({
+                name: z.name || 'Unknown',
+                type: this.mapZoneType(z.type),
+                coverage: typeof z.coverage === 'number' ? z.coverage : 0.5,
+                description: z.description || '',
+                boundingBox: z.boundingBox,
+              }));
+            }
+
+            if (parsed.edges && typeof parsed.edges === 'object') {
+              analysis.edges = {
+                top: parsed.edges.top || '',
+                left: parsed.edges.left || '',
+                right: parsed.edges.right || '',
+                bottom: parsed.edges.bottom || '',
+              };
+            }
+
+            if (parsed.orientation) {
+              analysis.orientation = this.mapOrientation(parsed.orientation);
+            }
+
+            analysis.isValid = true;
+            this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones (using ${formatType} format)`);
+
+            // Update the preferred format for future requests
+            if (formatType !== this.llmProviderType) {
+              this.console.log(`[Discovery] Switching to ${formatType} format for future requests`);
+              this.llmProviderType = formatType;
+            }
+
+            // Success - exit the retry loop
+            return analysis;
+          } catch (parseError) {
+            this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
+            analysis.error = 'Failed to parse LLM response';
+            return analysis;
           }
+        }
+      } catch (e) {
+        lastError = e;
 
-          if (parsed.edges && typeof parsed.edges === 'object') {
-            analysis.edges = {
-              top: parsed.edges.top || '',
-              left: parsed.edges.left || '',
-              right: parsed.edges.right || '',
-              bottom: parsed.edges.bottom || '',
-            };
-          }
+        // Check if this is a vision/multimodal format error
+        if (isVisionFormatError(e)) {
+          this.console.warn(`[Discovery] ${formatType} format failed, trying fallback...`);
+          continue; // Try next format
+        }
 
-          if (parsed.orientation) {
-            analysis.orientation = this.mapOrientation(parsed.orientation);
-          }
+        // Not a format error - don't retry
+        this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
+        break;
+      }
+    }
 
-          analysis.isValid = true;
-          this.console.log(`[Discovery] Analyzed ${cameraName}: ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
-        } catch (parseError) {
-          this.console.warn(`[Discovery] Failed to parse LLM response for ${cameraName}:`, parseError);
-          analysis.error = 'Failed to parse LLM response';
-        }
+    // All formats failed
+    if (lastError) {
+      const errorStr = String(lastError);
+      if (isVisionFormatError(lastError)) {
+        analysis.error = 'Vision/image analysis failed with all formats. Ensure you have a vision-capable model (e.g., gpt-4o, gpt-4-turbo, claude-3-sonnet) configured and the @scrypted/llm plugin supports vision.';
+      } else {
+        analysis.error = `Analysis failed: ${errorStr}`;
      }
-    } catch (e) {
-      this.console.warn(`[Discovery] Scene analysis failed for ${cameraName}:`, e);
-      analysis.error = `Analysis failed: ${e}`;
     }
 
     // Cache the analysis
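The retry order set up at the top of this hunk can be summarized as a small pure function; this is an illustrative restatement of the branching above (formatFallbackOrder is a hypothetical name, not part of the plugin):

    import { LlmProvider } from './spatial-reasoning';

    // Mirrors the formatsToTry construction above: detected format first, then alternates.
    function formatFallbackOrder(detected: LlmProvider): LlmProvider[] {
      switch (detected) {
        case 'openai': return ['openai', 'scrypted', 'anthropic'];
        case 'anthropic': return ['anthropic', 'scrypted', 'openai'];
        case 'scrypted': return ['scrypted', 'anthropic', 'openai'];
        // 'unknown' renders as the scrypted shape in buildImageContent, so that shape is effectively tried first.
        default: return [detected, 'scrypted', 'anthropic', 'openai'];
      }
    }

On the first successful response, the code also promotes the working format to this.llmProviderType, so later cameras skip the formats that already failed.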