@blueharford/scrypted-spatial-awareness 0.6.6 → 0.6.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/plugin.zip CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@blueharford/scrypted-spatial-awareness",
-  "version": "0.6.6",
+  "version": "0.6.8",
   "description": "Cross-camera object tracking for Scrypted NVR with spatial awareness",
   "author": "Joshua Seidel <blueharford>",
   "license": "Apache-2.0",
@@ -545,10 +545,11 @@ export class SpatialReasoningEngine {
   }
 
   /** Generate entry description when object enters property */
-  generateEntryDescription(
+  async generateEntryDescription(
     tracked: TrackedObject,
-    cameraId: string
-  ): SpatialReasoningResult {
+    cameraId: string,
+    mediaObject?: MediaObject
+  ): Promise<SpatialReasoningResult> {
     if (!this.topology) {
       return {
         description: `${this.capitalizeFirst(tracked.className)} entered property`,
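Note that this hunk changes the public signature of generateEntryDescription: the method becomes async and gains an optional MediaObject parameter, so any external caller must now await it. A minimal before/after sketch (engine, tracked, cameraId, and mediaObject are placeholder names, not from this diff):

    // 0.6.6: synchronous, topology-only
    // const result = engine.generateEntryDescription(tracked, cameraId);

    // 0.6.8: async, with an optional snapshot for the vision-LLM path
    const result = await engine.generateEntryDescription(tracked, cameraId, mediaObject);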
@@ -571,11 +572,10 @@ export class SpatialReasoningEngine {
     const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
     const objectType = this.capitalizeFirst(tracked.className);
 
-    // Build entry description using topology context
+    // Build basic entry description using topology context
     const location = this.describeLocation(camera, landmarks, 'to');
 
     // Check if we can determine where they came from (e.g., street, neighbor)
-    const entryLandmark = landmarks.find(l => l.isEntryPoint);
     const streetLandmark = landmarks.find(l => l.type === 'street');
     const neighborLandmark = landmarks.find(l => l.type === 'neighbor');
 
@@ -586,8 +586,25 @@ export class SpatialReasoningEngine {
       source = ` from ${neighborLandmark.name}`;
     }
 
+    const basicDescription = `${objectType} arrived at ${location}${source}`;
+
+    // Try LLM for enhanced description with visual details
+    if (this.config.enableLlm && mediaObject) {
+      const llmDescription = await this.getLlmEntryExitDescription(
+        tracked, camera, landmarks, 'entry', mediaObject
+      );
+      if (llmDescription) {
+        return {
+          description: llmDescription,
+          involvedLandmarks: landmarks,
+          confidence: 0.9,
+          usedLlm: true,
+        };
+      }
+    }
+
     return {
-      description: `${objectType} arrived at ${location}${source}`,
+      description: basicDescription,
       involvedLandmarks: landmarks,
       confidence: 0.8,
       usedLlm: false,
@@ -595,10 +612,11 @@ export class SpatialReasoningEngine {
   }
 
   /** Generate exit description when object leaves property */
-  generateExitDescription(
+  async generateExitDescription(
     tracked: TrackedObject,
-    cameraId: string
-  ): SpatialReasoningResult {
+    cameraId: string,
+    mediaObject?: MediaObject
+  ): Promise<SpatialReasoningResult> {
     if (!this.topology) {
       return {
         description: `${this.capitalizeFirst(tracked.className)} left property`,
@@ -621,7 +639,7 @@ export class SpatialReasoningEngine {
     const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
     const objectType = this.capitalizeFirst(tracked.className);
 
-    // Build exit description
+    // Build basic exit description
     const location = this.describeLocation(camera, landmarks, 'from');
 
     // Check for exit point landmarks
@@ -674,8 +692,25 @@ export class SpatialReasoningEngine {
       }
     }
 
+    const basicDescription = `${objectType} left ${location}${destination}${timeContext}${journeyContext}`;
+
+    // Try LLM for enhanced description with visual details
+    if (this.config.enableLlm && mediaObject) {
+      const llmDescription = await this.getLlmEntryExitDescription(
+        tracked, camera, landmarks, 'exit', mediaObject, journeyContext
+      );
+      if (llmDescription) {
+        return {
+          description: llmDescription,
+          involvedLandmarks: landmarks,
+          confidence: 0.9,
+          usedLlm: true,
+        };
+      }
+    }
+
     return {
-      description: `${objectType} left ${location}${destination}${timeContext}${journeyContext}`,
+      description: basicDescription,
       involvedLandmarks: landmarks,
       confidence: 0.8,
       usedLlm: false,
@@ -952,6 +987,107 @@ export class SpatialReasoningEngine {
     }
   }
 
+  /** Get LLM-enhanced description for entry/exit events */
+  private async getLlmEntryExitDescription(
+    tracked: TrackedObject,
+    camera: CameraNode,
+    landmarks: Landmark[],
+    eventType: 'entry' | 'exit',
+    mediaObject: MediaObject,
+    journeyContext?: string
+  ): Promise<string | null> {
+    const llm = await this.findLlmDevice();
+    if (!llm || !llm.getChatCompletion) return null;
+
+    try {
+      // Convert image to base64 for vision LLM
+      const imageData = await mediaObjectToBase64(mediaObject);
+
+      const landmarkNames = landmarks.map(l => l.name).join(', ') || 'none identified';
+      const dwellTime = Math.round((tracked.lastSeen - tracked.firstSeen) / 1000);
+
+      // Build context-aware prompt
+      const prompt = eventType === 'entry'
+        ? `You are a security camera system. Analyze this image and describe who/what just arrived.
+
+CONTEXT:
+- Camera: ${camera.name}
+- Object type: ${tracked.className}
+- Nearby landmarks: ${landmarkNames}
+
+INSTRUCTIONS:
+Look at the image and generate a single, natural sentence describing:
+1. Physical description (if person: gender, clothing, items carried; if vehicle: color, type, make)
+2. What they appear to be doing (arriving, approaching, etc.)
+3. Relevant landmark context (driveway, front door, mailbox, etc.)
+
+Examples of good descriptions:
+- "Man in gray hoodie approaching the front door"
+- "Woman in scrubs arriving with shopping bags"
+- "White delivery van pulling into the driveway"
+- "UPS driver carrying package towards the porch"
+- "Teenager on bicycle coming up the driveway"
+
+Generate ONLY the description, nothing else:`
+        : `You are a security camera system. Analyze this image and describe who/what is leaving.
+
+CONTEXT:
+- Camera: ${camera.name}
+- Object type: ${tracked.className}
+- Time on property: ${dwellTime > 60 ? Math.round(dwellTime / 60) + ' minutes' : dwellTime + ' seconds'}
+- Nearby landmarks: ${landmarkNames}
+${journeyContext ? `- Journey: ${journeyContext}` : ''}
+
+INSTRUCTIONS:
+Look at the image and generate a single, natural sentence describing:
+1. Physical description (if person: gender, clothing, items carried; if vehicle: color, type)
+2. What they did (if determinable from context)
+3. Direction they're leaving towards
+
+Examples of good descriptions:
+- "Man in black hoodie leaving after checking the mailbox"
+- "Woman in business attire heading to car in driveway"
+- "Red sedan backing out of the driveway"
+- "Delivery driver returning to FedEx truck after leaving package"
+- "Landscaper with leaf blower heading to work truck"
+
+Generate ONLY the description, nothing else:`;
+
+      // Build message content - use multimodal format if we have an image
+      let messageContent: any;
+      if (imageData) {
+        messageContent = [
+          { type: 'text', text: prompt },
+          buildImageContent(imageData, this.llmProviderType),
+        ];
+      } else {
+        messageContent = prompt;
+      }
+
+      // Call LLM using ChatCompletion interface
+      const result = await llm.getChatCompletion({
+        messages: [
+          {
+            role: 'user',
+            content: messageContent,
+          },
+        ],
+        max_tokens: 100,
+        temperature: 0.7,
+      });
+
+      const content = result?.choices?.[0]?.message?.content;
+      if (content && typeof content === 'string') {
+        return content.trim();
+      }
+
+      return null;
+    } catch (e) {
+      this.console.warn(`LLM ${eventType} description generation failed:`, e);
+      return null;
+    }
+  }
+
   /** Build LLM prompt with RAG context */
   private buildLlmPrompt(
     tracked: TrackedObject,
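The new method leans on two helpers that are referenced but not defined anywhere in this diff: mediaObjectToBase64 and buildImageContent. A plausible sketch of both, assuming Scrypted's mediaManager buffer-conversion API and an OpenAI-style image_url content part; the package's actual implementations may differ:

    import sdk, { MediaObject } from '@scrypted/sdk';

    // Sketch: convert a Scrypted MediaObject to a base64 JPEG string.
    async function mediaObjectToBase64(mo: MediaObject): Promise<string | undefined> {
      try {
        const buffer = await sdk.mediaManager.convertMediaObjectToBuffer(mo, 'image/jpeg');
        return buffer.toString('base64');
      } catch {
        return undefined;
      }
    }

    // Sketch: build an image content part for a multimodal chat message.
    // OpenAI-compatible providers accept an image_url part with a data URI;
    // the providerType parameter is a guess at why the diff passes this.llmProviderType.
    function buildImageContent(base64: string, providerType?: string) {
      return {
        type: 'image_url',
        image_url: { url: `data:image/jpeg;base64,${base64}` },
      };
    }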
@@ -536,10 +536,24 @@ export class TrackingEngine {
     // Check if we've already alerted for this object
     if (this.isInAlertCooldown(globalId)) return;
 
-    // Generate spatial description
-    const spatialResult = this.spatialReasoning.generateEntryDescription(
+    // Get snapshot for LLM description (if LLM is enabled)
+    let mediaObject: MediaObject | undefined;
+    if (this.config.useLlmDescriptions) {
+      try {
+        const camera = systemManager.getDeviceById<Camera>(sighting.cameraId);
+        if (camera?.interfaces?.includes(ScryptedInterface.Camera)) {
+          mediaObject = await camera.takePicture();
+        }
+      } catch (e) {
+        this.console.warn('Failed to get snapshot for entry description:', e);
+      }
+    }
+
+    // Generate spatial description (now async with LLM support)
+    const spatialResult = await this.spatialReasoning.generateEntryDescription(
       tracked,
-      sighting.cameraId
+      sighting.cameraId,
+      mediaObject
     );
 
     if (isEntryPoint) {
@@ -626,10 +640,24 @@ export class TrackingEngine {
     if (current && current.state === 'pending') {
       this.state.markExited(tracked.globalId, sighting.cameraId, sighting.cameraName);
 
-      // Generate rich exit description using topology context
-      const spatialResult = this.spatialReasoning.generateExitDescription(
+      // Get snapshot for LLM description (if LLM is enabled)
+      let mediaObject: MediaObject | undefined;
+      if (this.config.useLlmDescriptions) {
+        try {
+          const camera = systemManager.getDeviceById<Camera>(sighting.cameraId);
+          if (camera?.interfaces?.includes(ScryptedInterface.Camera)) {
+            mediaObject = await camera.takePicture();
+          }
+        } catch (e) {
+          this.console.warn('Failed to get snapshot for exit description:', e);
+        }
+      }
+
+      // Generate rich exit description using topology context (now async with LLM support)
+      const spatialResult = await this.spatialReasoning.generateExitDescription(
         current,
-        sighting.cameraId
+        sighting.cameraId,
+        mediaObject
       );
 
       this.console.log(
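Both call sites now await the reasoning methods, so everything downstream of spatialResult is unchanged. For illustration, a consumer can tell the two paths apart via the fields set in the hunks above (usedLlm: true at confidence 0.9 for vision-enhanced descriptions, usedLlm: false at 0.8 for the topology-only fallback); this log line is a sketch, not from the package:

    // Sketch: distinguish vision-enhanced descriptions from topology-only fallbacks.
    this.console.log(
      `[${spatialResult.usedLlm ? 'LLM' : 'basic'}] ${spatialResult.description}` +
      ` (confidence ${spatialResult.confidence})`
    );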
package/src/main.ts CHANGED
@@ -1584,7 +1584,7 @@ export class SpatialAwarenessPlugin extends ScryptedDeviceBase
     }
   }
 
-  private handleTrainingEndRequest(response: HttpResponse): void {
+  private async handleTrainingEndRequest(response: HttpResponse): Promise<void> {
     if (!this.trackingEngine) {
       response.send(JSON.stringify({ error: 'Tracking engine not running' }), {
         code: 500,
@@ -1595,6 +1595,44 @@ export class SpatialAwarenessPlugin extends ScryptedDeviceBase
 
     const session = this.trackingEngine.endTrainingSession();
     if (session) {
+      // Get unique visited cameras
+      const visitedCameraIds = [...new Set(session.visits.map(v => v.cameraId))];
+
+      // Auto-run discovery on visited cameras to detect landmarks and zones
+      if (this.discoveryEngine && visitedCameraIds.length > 0) {
+        this.console.log(`[Training] Running discovery analysis on ${visitedCameraIds.length} visited cameras...`);
+
+        let landmarksFound = 0;
+        let zonesFound = 0;
+
+        for (const cameraId of visitedCameraIds) {
+          try {
+            const analysis = await this.discoveryEngine.analyzeScene(cameraId);
+            if (analysis.isValid) {
+              landmarksFound += analysis.landmarks.length;
+              zonesFound += analysis.zones.length;
+              this.console.log(`[Training] ${cameraId}: Found ${analysis.landmarks.length} landmarks, ${analysis.zones.length} zones`);
+            }
+          } catch (e) {
+            this.console.warn(`[Training] Failed to analyze ${cameraId}:`, e);
+          }
+        }
+
+        // Get all pending suggestions and auto-accept them
+        const suggestions = this.discoveryEngine.getPendingSuggestions();
+        for (const suggestion of suggestions) {
+          this.applyDiscoverySuggestion(suggestion);
+          this.discoveryEngine.acceptSuggestion(suggestion.id);
+        }
+
+        // Persist topology after applying suggestions
+        if (suggestions.length > 0 && this.trackingEngine) {
+          const updatedTopology = this.trackingEngine.getTopology();
+          await this.storageSettings.putSetting('topology', JSON.stringify(updatedTopology));
+          this.console.log(`[Training] Auto-applied ${suggestions.length} discoveries (${landmarksFound} landmarks, ${zonesFound} zones)`);
+        }
+      }
+
       response.send(JSON.stringify(session), {
         headers: { 'Content-Type': 'application/json' },
       });
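The handler persists the updated topology under the 'topology' storage key after auto-accepting discoveries. A sketch of the matching load path on plugin startup, assuming the same key and the storage-settings values accessor; setTopology is a hypothetical setter, not shown in this diff:

    // Sketch: restore the topology persisted by handleTrainingEndRequest above.
    const raw = this.storageSettings.values.topology as string | undefined;
    if (raw) {
      try {
        this.trackingEngine?.setTopology(JSON.parse(raw)); // hypothetical setter
      } catch (e) {
        this.console.warn('Failed to parse persisted topology:', e);
      }
    }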