@blueharford/scrypted-spatial-awareness 0.4.7 → 0.5.0-beta

This diff shows the changes between two publicly released versions of the package, as published to a supported registry, and is provided for informational purposes only.
@@ -9,6 +9,8 @@ import sdk, {
     ObjectDetection,
     Camera,
     MediaObject,
+    ScryptedDevice,
+    ScryptedMimeTypes,
 } from '@scrypted/sdk';
 import {
     CameraTopology,
@@ -25,7 +27,7 @@ import {
 } from '../models/topology';
 import { TrackedObject, ObjectSighting } from '../models/tracked-object';
 
-const { systemManager } = sdk;
+const { systemManager, mediaManager } = sdk;
 
 /** Configuration for the spatial reasoning engine */
 export interface SpatialReasoningConfig {
@@ -61,11 +63,40 @@ interface ContextChunk {
     metadata: Record<string, any>;
 }
 
+/** Interface for ChatCompletion devices (from @scrypted/llm plugin) */
+interface ChatCompletionDevice extends ScryptedDevice {
+    getChatCompletion?(params: any): Promise<any>;
+    streamChatCompletion?(params: any): AsyncGenerator<any>;
+}
+
+/**
+ * Convert a MediaObject to a base64 data URL for vision LLM consumption
+ * @param mediaObject - MediaObject from camera.takePicture()
+ * @returns Base64 data URL (data:image/jpeg;base64,...) or null if conversion fails
+ */
+export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<string | null> {
+    try {
+        // Convert MediaObject to Buffer using mediaManager
+        const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
+
+        // Convert buffer to base64
+        const base64 = buffer.toString('base64');
+
+        // Determine MIME type - default to JPEG for camera images
+        const mimeType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+
+        return `data:${mimeType};base64,${base64}`;
+    } catch (e) {
+        console.warn('Failed to convert MediaObject to base64:', e);
+        return null;
+    }
+}
+
 export class SpatialReasoningEngine {
     private config: SpatialReasoningConfig;
     private console: Console;
     private topology: CameraTopology | null = null;
-    private llmDevice: ObjectDetection | null = null;
+    private llmDevice: ChatCompletionDevice | null = null;
     private contextChunks: ContextChunk[] = [];
     private topologyContextCache: string | null = null;
     private contextCacheTime: number = 0;
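The new mediaObjectToBase64 helper is the bridge between Scrypted's MediaObject pipeline and vision-capable chat models. A minimal usage sketch, assuming a device that implements the SDK's Camera interface; the import path for the helper is hypothetical:

    import { Camera } from '@scrypted/sdk';
    // Hypothetical path; import from wherever this module is exposed in your build.
    import { mediaObjectToBase64 } from './spatial-reasoning-engine';

    async function snapshotAsDataUrl(camera: Camera): Promise<string | null> {
        // takePicture() returns a MediaObject wrapping the current camera frame
        const picture = await camera.takePicture();
        // Resolves to 'data:image/jpeg;base64,...' or null if conversion fails
        return mediaObjectToBase64(picture);
    }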
@@ -303,30 +334,213 @@ export class SpatialReasoningEngine {
         return relevant;
     }
 
-    /** Find or initialize LLM device */
-    private async findLlmDevice(): Promise<ObjectDetection | null> {
+    private llmSearched: boolean = false;
+    private llmProvider: string | null = null;
+
+    /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
+    private async findLlmDevice(): Promise<ChatCompletionDevice | null> {
         if (this.llmDevice) return this.llmDevice;
+        if (this.llmSearched) return null; // Already searched and found nothing
+
+        this.llmSearched = true;
 
         try {
+            // Look for devices with ChatCompletion interface (the correct interface for @scrypted/llm)
             for (const id of Object.keys(systemManager.getSystemState())) {
                 const device = systemManager.getDeviceById(id);
-                if (device?.interfaces?.includes(ScryptedInterface.ObjectDetection)) {
-                    const name = device.name?.toLowerCase() || '';
-                    if (name.includes('llm') || name.includes('gpt') || name.includes('claude') ||
-                        name.includes('ollama') || name.includes('gemini')) {
-                        this.llmDevice = device as unknown as ObjectDetection;
-                        this.console.log(`Found LLM device: ${device.name}`);
-                        return this.llmDevice;
+                if (!device) continue;
+
+                // Check if this device has ChatCompletion interface
+                // The @scrypted/llm plugin exposes ChatCompletion, not ObjectDetection
+                if (device.interfaces?.includes('ChatCompletion')) {
+                    const deviceName = device.name?.toLowerCase() || '';
+                    const pluginId = (device as any).pluginId?.toLowerCase() || '';
+
+                    // Identify the provider type for logging
+                    let providerType = 'Unknown';
+                    if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+                        providerType = 'Scrypted LLM';
+                    }
+                    if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+                        providerType = 'OpenAI';
+                    } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+                        providerType = 'Anthropic';
+                    } else if (deviceName.includes('ollama')) {
+                        providerType = 'Ollama';
+                    } else if (deviceName.includes('gemini') || deviceName.includes('google')) {
+                        providerType = 'Google';
+                    } else if (deviceName.includes('llama')) {
+                        providerType = 'llama.cpp';
                     }
+
+                    this.llmDevice = device as unknown as ChatCompletionDevice;
+                    this.llmProvider = `${providerType} (${device.name})`;
+                    this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
+                    this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+                    this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
+                    return this.llmDevice;
                 }
             }
+
+            // If we get here, no LLM plugin found
+            this.console.warn('[LLM] No ChatCompletion device found. Install @scrypted/llm for enhanced descriptions.');
+            this.console.warn('[LLM] Falling back to rule-based descriptions using topology data.');
+
         } catch (e) {
-            this.console.warn('Error finding LLM device:', e);
+            this.console.error('[LLM] Error searching for LLM device:', e);
         }
 
         return null;
     }
 
+    /** Get the current LLM provider name */
+    getLlmProvider(): string | null {
+        return this.llmProvider;
+    }
+
+    /** Check if LLM is available */
+    isLlmAvailable(): boolean {
+        return this.llmDevice !== null;
+    }
+
+    /** Generate entry description when object enters property */
+    generateEntryDescription(
+        tracked: TrackedObject,
+        cameraId: string
+    ): SpatialReasoningResult {
+        if (!this.topology) {
+            return {
+                description: `${this.capitalizeFirst(tracked.className)} entered property`,
+                involvedLandmarks: [],
+                confidence: 0.5,
+                usedLlm: false,
+            };
+        }
+
+        const camera = findCamera(this.topology, cameraId);
+        if (!camera) {
+            return {
+                description: `${this.capitalizeFirst(tracked.className)} entered property`,
+                involvedLandmarks: [],
+                confidence: 0.5,
+                usedLlm: false,
+            };
+        }
+
+        const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
+        const objectType = this.capitalizeFirst(tracked.className);
+
+        // Build entry description using topology context
+        const location = this.describeLocation(camera, landmarks, 'to');
+
+        // Check if we can determine where they came from (e.g., street, neighbor)
+        const entryLandmark = landmarks.find(l => l.isEntryPoint);
+        const streetLandmark = landmarks.find(l => l.type === 'street');
+        const neighborLandmark = landmarks.find(l => l.type === 'neighbor');
+
+        let source = '';
+        if (streetLandmark) {
+            source = ` from ${streetLandmark.name}`;
+        } else if (neighborLandmark) {
+            source = ` from ${neighborLandmark.name}`;
+        }
+
+        return {
+            description: `${objectType} arrived at ${location}${source}`,
+            involvedLandmarks: landmarks,
+            confidence: 0.8,
+            usedLlm: false,
+        };
+    }
+
+    /** Generate exit description when object leaves property */
+    generateExitDescription(
+        tracked: TrackedObject,
+        cameraId: string
+    ): SpatialReasoningResult {
+        if (!this.topology) {
+            return {
+                description: `${this.capitalizeFirst(tracked.className)} left property`,
+                involvedLandmarks: [],
+                confidence: 0.5,
+                usedLlm: false,
+            };
+        }
+
+        const camera = findCamera(this.topology, cameraId);
+        if (!camera) {
+            return {
+                description: `${this.capitalizeFirst(tracked.className)} left property`,
+                involvedLandmarks: [],
+                confidence: 0.5,
+                usedLlm: false,
+            };
+        }
+
+        const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
+        const objectType = this.capitalizeFirst(tracked.className);
+
+        // Build exit description
+        const location = this.describeLocation(camera, landmarks, 'from');
+
+        // Check for exit point landmarks
+        const exitLandmark = landmarks.find(l => l.isExitPoint);
+        const streetLandmark = landmarks.find(l => l.type === 'street');
+
+        let destination = '';
+        if (streetLandmark) {
+            destination = ` towards ${streetLandmark.name}`;
+        } else if (exitLandmark) {
+            destination = ` via ${exitLandmark.name}`;
+        }
+
+        // Include time on property if available
+        const dwellTime = Math.round((tracked.lastSeen - tracked.firstSeen) / 1000);
+        let timeContext = '';
+        if (dwellTime > 60) {
+            timeContext = ` after ${Math.round(dwellTime / 60)}m on property`;
+        } else if (dwellTime > 10) {
+            timeContext = ` after ${dwellTime}s`;
+        }
+
+        // Summarize journey if they visited multiple cameras (use landmarks from topology)
+        let journeyContext = '';
+        if (tracked.journey.length > 0 && this.topology) {
+            const visitedLandmarks: string[] = [];
+
+            // Get landmarks from entry camera
+            if (tracked.entryCamera) {
+                const entryLandmarks = getLandmarksVisibleFromCamera(this.topology, tracked.entryCamera);
+                const entryLandmark = entryLandmarks.find(l => l.isEntryPoint || l.type === 'access') || entryLandmarks[0];
+                if (entryLandmark) {
+                    visitedLandmarks.push(entryLandmark.name);
+                }
+            }
+
+            // Get landmarks from journey segments
+            for (const segment of tracked.journey) {
+                const segmentLandmarks = getLandmarksVisibleFromCamera(this.topology, segment.toCameraId);
+                const segmentLandmark = segmentLandmarks.find(l =>
+                    !visitedLandmarks.includes(l.name) && (l.type === 'access' || l.type === 'zone' || l.type === 'structure')
+                );
+                if (segmentLandmark && !visitedLandmarks.includes(segmentLandmark.name)) {
+                    visitedLandmarks.push(segmentLandmark.name);
+                }
+            }
+
+            if (visitedLandmarks.length > 1) {
+                journeyContext = ` — visited ${visitedLandmarks.join(' → ')}`;
+            }
+        }
+
+        return {
+            description: `${objectType} left ${location}${destination}${timeContext}${journeyContext}`,
+            involvedLandmarks: landmarks,
+            confidence: 0.8,
+            usedLlm: false,
+        };
+    }
+
     /** Generate rich movement description using LLM */
     async generateMovementDescription(
         tracked: TrackedObject,
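The entry and exit generators above are deliberately LLM-free (usedLlm: false); they compose descriptions purely from topology landmarks. A sketch of the expected output, assuming a topology in which a camera 'front-door' sees an entry-point landmark named 'front walkway' and a street landmark named 'Oak Street' (all names invented for illustration):

    const result = engine.generateEntryDescription(trackedPerson, 'front-door');
    // result.description → "Person arrived at the front walkway from Oak Street"
    // result.confidence  → 0.8 (drops to 0.5 when the topology or camera is missing)
    // result.usedLlm     → false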
@@ -415,28 +629,92 @@ export class SpatialReasoningEngine {
         const objectType = this.capitalizeFirst(tracked.className);
         const transitSecs = Math.round(transitTime / 1000);
 
-        // Build origin description
-        let origin = fromCamera.name;
-        if (fromLandmarks.length > 0) {
-            const nearLandmark = fromLandmarks[0];
-            origin = `near ${nearLandmark.name}`;
-        } else if (fromCamera.context?.coverageDescription) {
-            origin = fromCamera.context.coverageDescription.split('.')[0];
-        }
+        // Get connection for path context
+        const connection = this.topology ? findConnection(this.topology, fromCamera.deviceId, toCamera.deviceId) : null;
+
+        // Build origin description using landmarks, camera context, or camera name
+        let origin = this.describeLocation(fromCamera, fromLandmarks, 'from');
 
         // Build destination description
-        let destination = toCamera.name;
-        if (toLandmarks.length > 0) {
-            const nearLandmark = toLandmarks[0];
-            destination = `towards ${nearLandmark.name}`;
-        } else if (toCamera.context?.coverageDescription) {
-            destination = `towards ${toCamera.context.coverageDescription.split('.')[0]}`;
+        let destination = this.describeLocation(toCamera, toLandmarks, 'to');
+
+        // Check if we have a named path/connection
+        let pathContext = '';
+        if (connection?.name) {
+            pathContext = ` via ${connection.name}`;
+        } else if (connection?.pathLandmarks?.length && this.topology) {
+            const pathNames = connection.pathLandmarks
+                .map(id => findLandmark(this.topology!, id)?.name)
+                .filter(Boolean);
+            if (pathNames.length > 0) {
+                pathContext = ` past ${pathNames.join(' and ')}`;
+            }
         }
 
-        // Build transit string
-        const transitStr = transitSecs > 0 ? ` (${transitSecs}s)` : '';
+        // Include journey context if this is not the first camera
+        let journeyContext = '';
+        if (tracked.journey.length > 0) {
+            const totalTime = Math.round((Date.now() - tracked.firstSeen) / 1000);
+            if (totalTime > 60) {
+                journeyContext = ` (${Math.round(totalTime / 60)}m on property)`;
+            }
+        }
+
+        // Determine movement verb based on transit time and object type
+        const verb = this.getMovementVerb(tracked.className, transitSecs);
+
+        return `${objectType} ${verb} ${origin} heading ${destination}${pathContext}${journeyContext}`;
+    }
+
+    /** Describe a location using landmarks, camera context, or camera name */
+    private describeLocation(camera: CameraNode, landmarks: Landmark[], direction: 'from' | 'to'): string {
+        // Priority 1: Use entry/exit landmarks
+        const entryExitLandmark = landmarks.find(l =>
+            (direction === 'from' && l.isExitPoint) || (direction === 'to' && l.isEntryPoint)
+        );
+        if (entryExitLandmark) {
+            return `the ${entryExitLandmark.name}`;
+        }
+
+        // Priority 2: Use access landmarks (driveway, walkway, etc.)
+        const accessLandmark = landmarks.find(l => l.type === 'access');
+        if (accessLandmark) {
+            return `the ${accessLandmark.name}`;
+        }
+
+        // Priority 3: Use zone landmarks (front yard, back yard)
+        const zoneLandmark = landmarks.find(l => l.type === 'zone');
+        if (zoneLandmark) {
+            return `the ${zoneLandmark.name}`;
+        }
+
+        // Priority 4: Use any landmark
+        if (landmarks.length > 0) {
+            return `near ${landmarks[0].name}`;
+        }
+
+        // Priority 5: Use camera coverage description
+        if (camera.context?.coverageDescription) {
+            const desc = camera.context.coverageDescription.split('.')[0].toLowerCase();
+            return `the ${desc}`;
+        }
+
+        // Fallback: Generic description (no camera name inference - use topology for context)
+        return 'property';
+    }
 
-        return `${objectType} moving from ${origin} ${destination}${transitStr}`;
+    /** Get appropriate movement verb based on context */
+    private getMovementVerb(className: string, transitSecs: number): string {
+        if (className === 'car' || className === 'vehicle' || className === 'truck') {
+            return transitSecs < 10 ? 'driving from' : 'moved from';
+        }
+        if (transitSecs < 5) {
+            return 'walking from';
+        }
+        if (transitSecs < 30) {
+            return 'moved from';
+        }
+        return 'traveled from';
     }
 
     /** Build path description from connection */
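describeLocation resolves the first match in a fixed priority order: entry/exit-point landmark, then 'access' type, then 'zone' type, then any landmark, then the camera's coverage description, and finally the generic 'property'. A trace over a hypothetical landmark set (invented values; only the fields the method inspects are shown):

    const landmarks = [
        { name: 'front yard', type: 'zone',   isEntryPoint: false, isExitPoint: false },
        { name: 'driveway',   type: 'access', isEntryPoint: true,  isExitPoint: false },
    ] as unknown as Landmark[];
    // direction 'to'   → "the driveway"    (priority 1: entry-point landmark)
    // direction 'from' → "the driveway"    (no exit point, so priority 2: 'access' type)
    // with only the zone landmark → "the front yard"; with none at all → "property"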
@@ -458,7 +736,7 @@ export class SpatialReasoningEngine {
         return connection.name || undefined;
     }
 
-    /** Get LLM-enhanced description */
+    /** Get LLM-enhanced description using ChatCompletion interface with vision support */
     private async getLlmEnhancedDescription(
         tracked: TrackedObject,
         fromCamera: CameraNode,
@@ -469,9 +747,12 @@ export class SpatialReasoningEngine {
         mediaObject: MediaObject
     ): Promise<string | null> {
         const llm = await this.findLlmDevice();
-        if (!llm) return null;
+        if (!llm || !llm.getChatCompletion) return null;
 
         try {
+            // Convert image to base64 for vision LLM
+            const imageBase64 = await mediaObjectToBase64(mediaObject);
+
             // Retrieve relevant context for RAG
             const relevantChunks = this.retrieveRelevantContext(
                 fromCamera.deviceId,
@@ -492,14 +773,35 @@ export class SpatialReasoningEngine {
                 ragContext
             );
 
-            // Call LLM
-            const result = await llm.detectObjects(mediaObject, {
-                settings: { prompt }
-            } as any);
+            // Build message content - use multimodal format if we have an image
+            let messageContent: any;
+            if (imageBase64) {
+                // Vision-capable multimodal message format (OpenAI compatible)
+                messageContent = [
+                    { type: 'text', text: prompt },
+                    { type: 'image_url', image_url: { url: imageBase64 } },
+                ];
+            } else {
+                // Fallback to text-only if image conversion failed
+                messageContent = prompt;
+            }
+
+            // Call LLM using ChatCompletion interface
+            const result = await llm.getChatCompletion({
+                messages: [
+                    {
+                        role: 'user',
+                        content: messageContent,
+                    },
+                ],
+                max_tokens: 150,
+                temperature: 0.7,
+            });
 
-            // Extract description from result
-            if (result.detections?.[0]?.label) {
-                return result.detections[0].label;
+            // Extract description from ChatCompletion result
+            const content = result?.choices?.[0]?.message?.content;
+            if (content && typeof content === 'string') {
+                return content.trim();
             }
 
             return null;
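The call above speaks the OpenAI chat-completions dialect rather than ObjectDetection's detections array. Roughly the payload it builds when an image is available (illustrative values; which sampling parameters a given ChatCompletion device honors may vary by provider):

    const request = {
        messages: [{
            role: 'user',
            content: [
                { type: 'text', text: prompt },
                { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,...' } },
            ],
        }],
        max_tokens: 150,
        temperature: 0.7,
    };
    // The response is read back OpenAI-style:
    // result.choices[0].message.content → "Person walked up the driveway toward the front porch"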
@@ -547,7 +849,7 @@ Examples of good descriptions:
 Generate ONLY the description, nothing else:`;
     }
 
-    /** Suggest a new landmark based on AI analysis */
+    /** Suggest a new landmark based on AI analysis using ChatCompletion with vision */
     async suggestLandmark(
         cameraId: string,
         mediaObject: MediaObject,
@@ -557,9 +859,12 @@ Generate ONLY the description, nothing else:`;
         if (!this.config.enableLandmarkLearning) return null;
 
         const llm = await this.findLlmDevice();
-        if (!llm) return null;
+        if (!llm || !llm.getChatCompletion) return null;
 
         try {
+            // Convert image to base64 for vision LLM
+            const imageBase64 = await mediaObjectToBase64(mediaObject);
+
             const prompt = `Analyze this security camera image. A ${objectClass} was detected.
 
 Looking at the surroundings and environment, identify any notable landmarks or features visible that could help describe this location. Consider:
@@ -573,13 +878,35 @@ If you can identify a clear landmark feature, respond with ONLY a JSON object:
 
 If no clear landmark is identifiable, respond with: {"name": null}`;
 
-            const result = await llm.detectObjects(mediaObject, {
-                settings: { prompt }
-            } as any);
+            // Build message content - use multimodal format if we have an image
+            let messageContent: any;
+            if (imageBase64) {
+                // Vision-capable multimodal message format (OpenAI compatible)
+                messageContent = [
+                    { type: 'text', text: prompt },
+                    { type: 'image_url', image_url: { url: imageBase64 } },
+                ];
+            } else {
+                // Fallback to text-only if image conversion failed
+                messageContent = prompt;
+            }
+
+            // Call LLM using ChatCompletion interface
+            const result = await llm.getChatCompletion({
+                messages: [
+                    {
+                        role: 'user',
+                        content: messageContent,
+                    },
+                ],
+                max_tokens: 100,
+                temperature: 0.3,
+            });
 
-            if (result.detections?.[0]?.label) {
+            const content = result?.choices?.[0]?.message?.content;
+            if (content && typeof content === 'string') {
                 try {
-                    const parsed = JSON.parse(result.detections[0].label);
+                    const parsed = JSON.parse(content.trim());
                     if (parsed.name && parsed.type) {
                         const suggestionId = `suggest_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
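For reference, a model reply that survives the JSON.parse and parsed.name && parsed.type checks above would look like this (values illustrative, matching the shape the prompt requests):

    const reply = '{"name": "red brick mailbox", "type": "structure"}';
    const parsed = JSON.parse(reply.trim());
    // parsed.name → "red brick mailbox", parsed.type → "structure" → a suggestion is created
    // A reply of '{"name": null}' parses cleanly but is skipped, since parsed.name is falsy.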