@blueharford/scrypted-spatial-awareness 0.6.7 → 0.6.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blueharford/scrypted-spatial-awareness",
3
- "version": "0.6.7",
3
+ "version": "0.6.9",
4
4
  "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
5
5
  "author": "Joshua Seidel <blueharford>",
6
6
  "license": "Apache-2.0",
@@ -545,10 +545,11 @@ export class SpatialReasoningEngine {
545
545
  }
546
546
 
547
547
  /** Generate entry description when object enters property */
548
- generateEntryDescription(
548
+ async generateEntryDescription(
549
549
  tracked: TrackedObject,
550
- cameraId: string
551
- ): SpatialReasoningResult {
550
+ cameraId: string,
551
+ mediaObject?: MediaObject
552
+ ): Promise<SpatialReasoningResult> {
552
553
  if (!this.topology) {
553
554
  return {
554
555
  description: `${this.capitalizeFirst(tracked.className)} entered property`,
@@ -571,11 +572,10 @@ export class SpatialReasoningEngine {
571
572
  const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
572
573
  const objectType = this.capitalizeFirst(tracked.className);
573
574
 
574
- // Build entry description using topology context
575
+ // Build basic entry description using topology context
575
576
  const location = this.describeLocation(camera, landmarks, 'to');
576
577
 
577
578
  // Check if we can determine where they came from (e.g., street, neighbor)
578
- const entryLandmark = landmarks.find(l => l.isEntryPoint);
579
579
  const streetLandmark = landmarks.find(l => l.type === 'street');
580
580
  const neighborLandmark = landmarks.find(l => l.type === 'neighbor');
581
581
 
@@ -586,8 +586,31 @@ export class SpatialReasoningEngine {
586
586
  source = ` from ${neighborLandmark.name}`;
587
587
  }
588
588
 
589
+ const basicDescription = `${objectType} arrived at ${location}${source}`;
590
+
591
+ // Try LLM for enhanced description with visual details
592
+ this.console.log(`[Entry] enableLlm=${this.config.enableLlm}, hasMediaObject=${!!mediaObject}`);
593
+ if (this.config.enableLlm && mediaObject) {
594
+ this.console.log(`[Entry] Attempting LLM description for entry event`);
595
+ const llmDescription = await this.getLlmEntryExitDescription(
596
+ tracked, camera, landmarks, 'entry', mediaObject
597
+ );
598
+ if (llmDescription) {
599
+ this.console.log(`[Entry] LLM returned: ${llmDescription.substring(0, 50)}...`);
600
+ return {
601
+ description: llmDescription,
602
+ involvedLandmarks: landmarks,
603
+ confidence: 0.9,
604
+ usedLlm: true,
605
+ };
606
+ }
607
+ this.console.warn(`[Entry] LLM returned null, falling back to basic`);
608
+ } else {
609
+ this.console.log(`[Entry] Skipping LLM (enableLlm=${this.config.enableLlm}, mediaObject=${!!mediaObject})`);
610
+ }
611
+
589
612
  return {
590
- description: `${objectType} arrived at ${location}${source}`,
613
+ description: basicDescription,
591
614
  involvedLandmarks: landmarks,
592
615
  confidence: 0.8,
593
616
  usedLlm: false,
@@ -595,10 +618,11 @@ export class SpatialReasoningEngine {
595
618
  }
596
619
 
597
620
  /** Generate exit description when object leaves property */
598
- generateExitDescription(
621
+ async generateExitDescription(
599
622
  tracked: TrackedObject,
600
- cameraId: string
601
- ): SpatialReasoningResult {
623
+ cameraId: string,
624
+ mediaObject?: MediaObject
625
+ ): Promise<SpatialReasoningResult> {
602
626
  if (!this.topology) {
603
627
  return {
604
628
  description: `${this.capitalizeFirst(tracked.className)} left property`,
@@ -621,7 +645,7 @@ export class SpatialReasoningEngine {
621
645
  const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
622
646
  const objectType = this.capitalizeFirst(tracked.className);
623
647
 
624
- // Build exit description
648
+ // Build basic exit description
625
649
  const location = this.describeLocation(camera, landmarks, 'from');
626
650
 
627
651
  // Check for exit point landmarks
@@ -674,8 +698,31 @@ export class SpatialReasoningEngine {
674
698
  }
675
699
  }
676
700
 
701
+ const basicDescription = `${objectType} left ${location}${destination}${timeContext}${journeyContext}`;
702
+
703
+ // Try LLM for enhanced description with visual details
704
+ this.console.log(`[Exit] enableLlm=${this.config.enableLlm}, hasMediaObject=${!!mediaObject}`);
705
+ if (this.config.enableLlm && mediaObject) {
706
+ this.console.log(`[Exit] Attempting LLM description for exit event`);
707
+ const llmDescription = await this.getLlmEntryExitDescription(
708
+ tracked, camera, landmarks, 'exit', mediaObject, journeyContext
709
+ );
710
+ if (llmDescription) {
711
+ this.console.log(`[Exit] LLM returned: ${llmDescription.substring(0, 50)}...`);
712
+ return {
713
+ description: llmDescription,
714
+ involvedLandmarks: landmarks,
715
+ confidence: 0.9,
716
+ usedLlm: true,
717
+ };
718
+ }
719
+ this.console.warn(`[Exit] LLM returned null, falling back to basic`);
720
+ } else {
721
+ this.console.log(`[Exit] Skipping LLM (enableLlm=${this.config.enableLlm}, mediaObject=${!!mediaObject})`);
722
+ }
723
+
677
724
  return {
678
- description: `${objectType} left ${location}${destination}${timeContext}${journeyContext}`,
725
+ description: basicDescription,
679
726
  involvedLandmarks: landmarks,
680
727
  confidence: 0.8,
681
728
  usedLlm: false,
@@ -952,6 +999,122 @@ export class SpatialReasoningEngine {
952
999
  }
953
1000
  }
954
1001
 
1002
+ /** Get LLM-enhanced description for entry/exit events */
1003
+ private async getLlmEntryExitDescription(
1004
+ tracked: TrackedObject,
1005
+ camera: CameraNode,
1006
+ landmarks: Landmark[],
1007
+ eventType: 'entry' | 'exit',
1008
+ mediaObject: MediaObject,
1009
+ journeyContext?: string
1010
+ ): Promise<string | null> {
1011
+ this.console.log(`[LLM] getLlmEntryExitDescription called for ${eventType} event`);
1012
+
1013
+ const llm = await this.findLlmDevice();
1014
+ if (!llm) {
1015
+ this.console.warn(`[LLM] No LLM device found for ${eventType} description`);
1016
+ return null;
1017
+ }
1018
+ if (!llm.getChatCompletion) {
1019
+ this.console.warn(`[LLM] LLM device has no getChatCompletion method`);
1020
+ return null;
1021
+ }
1022
+
1023
+ this.console.log(`[LLM] Using LLM device: ${this.llmProvider}`);
1024
+
1025
+ try {
1026
+ // Convert image to base64 for vision LLM
1027
+ const imageData = await mediaObjectToBase64(mediaObject);
1028
+ this.console.log(`[LLM] Image converted: ${imageData ? 'success' : 'failed'}, type: ${imageData?.mediaType}`);
1029
+
1030
+ const landmarkNames = landmarks.map(l => l.name).join(', ') || 'none identified';
1031
+ const dwellTime = Math.round((tracked.lastSeen - tracked.firstSeen) / 1000);
1032
+
1033
+ // Build context-aware prompt
1034
+ const prompt = eventType === 'entry'
1035
+ ? `You are a security camera system. Analyze this image and describe who/what just arrived.
1036
+
1037
+ CONTEXT:
1038
+ - Camera: ${camera.name}
1039
+ - Object type: ${tracked.className}
1040
+ - Nearby landmarks: ${landmarkNames}
1041
+
1042
+ INSTRUCTIONS:
1043
+ Look at the image and generate a single, natural sentence describing:
1044
+ 1. Physical description (if person: gender, clothing, items carried; if vehicle: color, type, make)
1045
+ 2. What they appear to be doing (arriving, approaching, etc.)
1046
+ 3. Relevant landmark context (driveway, front door, mailbox, etc.)
1047
+
1048
+ Examples of good descriptions:
1049
+ - "Man in gray hoodie approaching the front door"
1050
+ - "Woman in scrubs arriving with shopping bags"
1051
+ - "White delivery van pulling into the driveway"
1052
+ - "UPS driver carrying package towards the porch"
1053
+ - "Teenager on bicycle coming up the driveway"
1054
+
1055
+ Generate ONLY the description, nothing else:`
1056
+ : `You are a security camera system. Analyze this image and describe who/what is leaving.
1057
+
1058
+ CONTEXT:
1059
+ - Camera: ${camera.name}
1060
+ - Object type: ${tracked.className}
1061
+ - Time on property: ${dwellTime > 60 ? Math.round(dwellTime / 60) + ' minutes' : dwellTime + ' seconds'}
1062
+ - Nearby landmarks: ${landmarkNames}
1063
+ ${journeyContext ? `- Journey: ${journeyContext}` : ''}
1064
+
1065
+ INSTRUCTIONS:
1066
+ Look at the image and generate a single, natural sentence describing:
1067
+ 1. Physical description (if person: gender, clothing, items carried; if vehicle: color, type)
1068
+ 2. What they did (if determinable from context)
1069
+ 3. Direction they're leaving towards
1070
+
1071
+ Examples of good descriptions:
1072
+ - "Man in black hoodie leaving after checking the mailbox"
1073
+ - "Woman in business attire heading to car in driveway"
1074
+ - "Red sedan backing out of the driveway"
1075
+ - "Delivery driver returning to FedEx truck after leaving package"
1076
+ - "Landscaper with leaf blower heading to work truck"
1077
+
1078
+ Generate ONLY the description, nothing else:`;
1079
+
1080
+ // Build message content - use multimodal format if we have an image
1081
+ let messageContent: any;
1082
+ if (imageData) {
1083
+ messageContent = [
1084
+ { type: 'text', text: prompt },
1085
+ buildImageContent(imageData, this.llmProviderType),
1086
+ ];
1087
+ } else {
1088
+ messageContent = prompt;
1089
+ }
1090
+
1091
+ // Call LLM using ChatCompletion interface
1092
+ this.console.log(`[LLM] Calling getChatCompletion for ${eventType}...`);
1093
+ const result = await llm.getChatCompletion({
1094
+ messages: [
1095
+ {
1096
+ role: 'user',
1097
+ content: messageContent,
1098
+ },
1099
+ ],
1100
+ max_tokens: 100,
1101
+ temperature: 0.7,
1102
+ });
1103
+
1104
+ const content = result?.choices?.[0]?.message?.content;
1105
+ if (content && typeof content === 'string') {
1106
+ this.console.log(`[LLM] Got ${eventType} description: ${content.trim().substring(0, 50)}...`);
1107
+ return content.trim();
1108
+ }
1109
+
1110
+ this.console.warn(`[LLM] No content in response for ${eventType}`);
1111
+ return null;
1112
+ } catch (e) {
1113
+ this.console.warn(`[LLM] ${eventType} description generation failed:`, e);
1114
+ return null;
1115
+ }
1116
+ }
1117
+
955
1118
  /** Build LLM prompt with RAG context */
956
1119
  private buildLlmPrompt(
957
1120
  tracked: TrackedObject,
@@ -536,11 +536,30 @@ export class TrackingEngine {
536
536
  // Check if we've already alerted for this object
537
537
  if (this.isInAlertCooldown(globalId)) return;
538
538
 
539
- // Generate spatial description
540
- const spatialResult = this.spatialReasoning.generateEntryDescription(
539
+ // Get snapshot for LLM description (if LLM is enabled)
540
+ let mediaObject: MediaObject | undefined;
541
+ this.console.log(`[Entry Alert] useLlmDescriptions=${this.config.useLlmDescriptions}`);
542
+ if (this.config.useLlmDescriptions) {
543
+ try {
544
+ const camera = systemManager.getDeviceById<Camera>(sighting.cameraId);
545
+ this.console.log(`[Entry Alert] Camera ${sighting.cameraId} has Camera interface: ${camera?.interfaces?.includes(ScryptedInterface.Camera)}`);
546
+ if (camera?.interfaces?.includes(ScryptedInterface.Camera)) {
547
+ mediaObject = await camera.takePicture();
548
+ this.console.log(`[Entry Alert] Got snapshot: ${!!mediaObject}`);
549
+ }
550
+ } catch (e) {
551
+ this.console.warn('[Entry Alert] Failed to get snapshot:', e);
552
+ }
553
+ }
554
+
555
+ // Generate spatial description (now async with LLM support)
556
+ this.console.log(`[Entry Alert] Calling generateEntryDescription with mediaObject=${!!mediaObject}`);
557
+ const spatialResult = await this.spatialReasoning.generateEntryDescription(
541
558
  tracked,
542
- sighting.cameraId
559
+ sighting.cameraId,
560
+ mediaObject
543
561
  );
562
+ this.console.log(`[Entry Alert] Got description: "${spatialResult.description.substring(0, 60)}...", usedLlm=${spatialResult.usedLlm}`);
544
563
 
545
564
  if (isEntryPoint) {
546
565
  // Entry point - generate property entry alert
@@ -626,14 +645,32 @@ export class TrackingEngine {
626
645
  if (current && current.state === 'pending') {
627
646
  this.state.markExited(tracked.globalId, sighting.cameraId, sighting.cameraName);
628
647
 
629
- // Generate rich exit description using topology context
630
- const spatialResult = this.spatialReasoning.generateExitDescription(
648
+ // Get snapshot for LLM description (if LLM is enabled)
649
+ let mediaObject: MediaObject | undefined;
650
+ this.console.log(`[Exit Alert] useLlmDescriptions=${this.config.useLlmDescriptions}`);
651
+ if (this.config.useLlmDescriptions) {
652
+ try {
653
+ const camera = systemManager.getDeviceById<Camera>(sighting.cameraId);
654
+ this.console.log(`[Exit Alert] Camera ${sighting.cameraId} has Camera interface: ${camera?.interfaces?.includes(ScryptedInterface.Camera)}`);
655
+ if (camera?.interfaces?.includes(ScryptedInterface.Camera)) {
656
+ mediaObject = await camera.takePicture();
657
+ this.console.log(`[Exit Alert] Got snapshot: ${!!mediaObject}`);
658
+ }
659
+ } catch (e) {
660
+ this.console.warn('[Exit Alert] Failed to get snapshot:', e);
661
+ }
662
+ }
663
+
664
+ // Generate rich exit description using topology context (now async with LLM support)
665
+ this.console.log(`[Exit Alert] Calling generateExitDescription with mediaObject=${!!mediaObject}`);
666
+ const spatialResult = await this.spatialReasoning.generateExitDescription(
631
667
  current,
632
- sighting.cameraId
668
+ sighting.cameraId,
669
+ mediaObject
633
670
  );
634
671
 
635
672
  this.console.log(
636
- `Object ${tracked.globalId.slice(0, 8)} exited: ${spatialResult.description}`
673
+ `[Exit Alert] Object ${tracked.globalId.slice(0, 8)} exited: "${spatialResult.description.substring(0, 60)}...", usedLlm=${spatialResult.usedLlm}`
637
674
  );
638
675
 
639
676
  await this.alertManager.checkAndAlert('property_exit', current, {