@blueharford/scrypted-spatial-awareness 0.6.7 → 0.6.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/main.nodejs.js +1 -1
- package/dist/main.nodejs.js.map +1 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +187 -12
- package/out/main.nodejs.js.map +1 -1
- package/out/plugin.zip +0 -0
- package/package.json +1 -1
- package/src/core/spatial-reasoning.ts +174 -11
- package/src/core/tracking-engine.ts +44 -7
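
Taken together, 0.6.9 threads an optional camera snapshot through the alert pipeline so a vision LLM can enrich entry/exit descriptions: generateEntryDescription and generateExitDescription become async and accept a MediaObject, TrackingEngine captures the snapshot, and a new private getLlmEntryExitDescription method performs the LLM call with a fallback to the topology-only text. A minimal caller-side sketch of the signature change (the engine variable and surrounding wiring are illustrative, not from the package):

    // 0.6.7: synchronous, two arguments
    // const result = engine.generateEntryDescription(tracked, cameraId);

    // 0.6.9: asynchronous, with an optional snapshot for the vision LLM
    const result = await engine.generateEntryDescription(tracked, cameraId, mediaObject);
    console.log(result.usedLlm
      ? `LLM description (confidence ${result.confidence}): ${result.description}`
      : `Basic description: ${result.description}`);
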
package/out/plugin.zip
CHANGED

Binary file
package/src/core/spatial-reasoning.ts
CHANGED

@@ -545,10 +545,11 @@ export class SpatialReasoningEngine {
   }
 
   /** Generate entry description when object enters property */
-  generateEntryDescription(
+  async generateEntryDescription(
     tracked: TrackedObject,
-    cameraId: string
-  ): SpatialReasoningResult {
+    cameraId: string,
+    mediaObject?: MediaObject
+  ): Promise<SpatialReasoningResult> {
     if (!this.topology) {
       return {
         description: `${this.capitalizeFirst(tracked.className)} entered property`,

@@ -571,11 +572,10 @@ export class SpatialReasoningEngine {
     const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
     const objectType = this.capitalizeFirst(tracked.className);
 
-    // Build entry description using topology context
+    // Build basic entry description using topology context
     const location = this.describeLocation(camera, landmarks, 'to');
 
     // Check if we can determine where they came from (e.g., street, neighbor)
-    const entryLandmark = landmarks.find(l => l.isEntryPoint);
     const streetLandmark = landmarks.find(l => l.type === 'street');
     const neighborLandmark = landmarks.find(l => l.type === 'neighbor');
 

@@ -586,8 +586,31 @@ export class SpatialReasoningEngine {
       source = ` from ${neighborLandmark.name}`;
     }
 
+    const basicDescription = `${objectType} arrived at ${location}${source}`;
+
+    // Try LLM for enhanced description with visual details
+    this.console.log(`[Entry] enableLlm=${this.config.enableLlm}, hasMediaObject=${!!mediaObject}`);
+    if (this.config.enableLlm && mediaObject) {
+      this.console.log(`[Entry] Attempting LLM description for entry event`);
+      const llmDescription = await this.getLlmEntryExitDescription(
+        tracked, camera, landmarks, 'entry', mediaObject
+      );
+      if (llmDescription) {
+        this.console.log(`[Entry] LLM returned: ${llmDescription.substring(0, 50)}...`);
+        return {
+          description: llmDescription,
+          involvedLandmarks: landmarks,
+          confidence: 0.9,
+          usedLlm: true,
+        };
+      }
+      this.console.warn(`[Entry] LLM returned null, falling back to basic`);
+    } else {
+      this.console.log(`[Entry] Skipping LLM (enableLlm=${this.config.enableLlm}, mediaObject=${!!mediaObject})`);
+    }
+
     return {
-      description: `${objectType} arrived at ${location}${source}`,
+      description: basicDescription,
       involvedLandmarks: landmarks,
       confidence: 0.8,
       usedLlm: false,
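
Both the LLM path and the fallback return the same result shape. Inferred from the fields used in these hunks (the actual interface is defined elsewhere in spatial-reasoning.ts and may carry additional members):

    interface SpatialReasoningResult {
      description: string;           // natural-language event description
      involvedLandmarks: Landmark[]; // landmarks visible from the camera
      confidence: number;            // 0.9 for LLM output, 0.8 for the basic path
      usedLlm: boolean;
    }
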
@@ -595,10 +618,11 @@ export class SpatialReasoningEngine {
   }
 
   /** Generate exit description when object leaves property */
-  generateExitDescription(
+  async generateExitDescription(
     tracked: TrackedObject,
-    cameraId: string
-  ): SpatialReasoningResult {
+    cameraId: string,
+    mediaObject?: MediaObject
+  ): Promise<SpatialReasoningResult> {
     if (!this.topology) {
       return {
         description: `${this.capitalizeFirst(tracked.className)} left property`,

@@ -621,7 +645,7 @@ export class SpatialReasoningEngine {
     const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
     const objectType = this.capitalizeFirst(tracked.className);
 
-    // Build exit description
+    // Build basic exit description
     const location = this.describeLocation(camera, landmarks, 'from');
 
     // Check for exit point landmarks

@@ -674,8 +698,31 @@ export class SpatialReasoningEngine {
       }
     }
 
+    const basicDescription = `${objectType} left ${location}${destination}${timeContext}${journeyContext}`;
+
+    // Try LLM for enhanced description with visual details
+    this.console.log(`[Exit] enableLlm=${this.config.enableLlm}, hasMediaObject=${!!mediaObject}`);
+    if (this.config.enableLlm && mediaObject) {
+      this.console.log(`[Exit] Attempting LLM description for exit event`);
+      const llmDescription = await this.getLlmEntryExitDescription(
+        tracked, camera, landmarks, 'exit', mediaObject, journeyContext
+      );
+      if (llmDescription) {
+        this.console.log(`[Exit] LLM returned: ${llmDescription.substring(0, 50)}...`);
+        return {
+          description: llmDescription,
+          involvedLandmarks: landmarks,
+          confidence: 0.9,
+          usedLlm: true,
+        };
+      }
+      this.console.warn(`[Exit] LLM returned null, falling back to basic`);
+    } else {
+      this.console.log(`[Exit] Skipping LLM (enableLlm=${this.config.enableLlm}, mediaObject=${!!mediaObject})`);
+    }
+
     return {
-      description: `${objectType} left ${location}${destination}${timeContext}${journeyContext}`,
+      description: basicDescription,
       involvedLandmarks: landmarks,
       confidence: 0.8,
       usedLlm: false,

@@ -952,6 +999,122 @@ export class SpatialReasoningEngine {
     }
   }
 
+  /** Get LLM-enhanced description for entry/exit events */
+  private async getLlmEntryExitDescription(
+    tracked: TrackedObject,
+    camera: CameraNode,
+    landmarks: Landmark[],
+    eventType: 'entry' | 'exit',
+    mediaObject: MediaObject,
+    journeyContext?: string
+  ): Promise<string | null> {
+    this.console.log(`[LLM] getLlmEntryExitDescription called for ${eventType} event`);
+
+    const llm = await this.findLlmDevice();
+    if (!llm) {
+      this.console.warn(`[LLM] No LLM device found for ${eventType} description`);
+      return null;
+    }
+    if (!llm.getChatCompletion) {
+      this.console.warn(`[LLM] LLM device has no getChatCompletion method`);
+      return null;
+    }
+
+    this.console.log(`[LLM] Using LLM device: ${this.llmProvider}`);
+
+    try {
+      // Convert image to base64 for vision LLM
+      const imageData = await mediaObjectToBase64(mediaObject);
+      this.console.log(`[LLM] Image converted: ${imageData ? 'success' : 'failed'}, type: ${imageData?.mediaType}`);
+
+      const landmarkNames = landmarks.map(l => l.name).join(', ') || 'none identified';
+      const dwellTime = Math.round((tracked.lastSeen - tracked.firstSeen) / 1000);
+
+      // Build context-aware prompt
+      const prompt = eventType === 'entry'
+        ? `You are a security camera system. Analyze this image and describe who/what just arrived.
+
+CONTEXT:
+- Camera: ${camera.name}
+- Object type: ${tracked.className}
+- Nearby landmarks: ${landmarkNames}
+
+INSTRUCTIONS:
+Look at the image and generate a single, natural sentence describing:
+1. Physical description (if person: gender, clothing, items carried; if vehicle: color, type, make)
+2. What they appear to be doing (arriving, approaching, etc.)
+3. Relevant landmark context (driveway, front door, mailbox, etc.)
+
+Examples of good descriptions:
+- "Man in gray hoodie approaching the front door"
+- "Woman in scrubs arriving with shopping bags"
+- "White delivery van pulling into the driveway"
+- "UPS driver carrying package towards the porch"
+- "Teenager on bicycle coming up the driveway"
+
+Generate ONLY the description, nothing else:`
+        : `You are a security camera system. Analyze this image and describe who/what is leaving.
+
+CONTEXT:
+- Camera: ${camera.name}
+- Object type: ${tracked.className}
+- Time on property: ${dwellTime > 60 ? Math.round(dwellTime / 60) + ' minutes' : dwellTime + ' seconds'}
+- Nearby landmarks: ${landmarkNames}
+${journeyContext ? `- Journey: ${journeyContext}` : ''}
+
+INSTRUCTIONS:
+Look at the image and generate a single, natural sentence describing:
+1. Physical description (if person: gender, clothing, items carried; if vehicle: color, type)
+2. What they did (if determinable from context)
+3. Direction they're leaving towards
+
+Examples of good descriptions:
+- "Man in black hoodie leaving after checking the mailbox"
+- "Woman in business attire heading to car in driveway"
+- "Red sedan backing out of the driveway"
+- "Delivery driver returning to FedEx truck after leaving package"
+- "Landscaper with leaf blower heading to work truck"
+
+Generate ONLY the description, nothing else:`;
+
+      // Build message content - use multimodal format if we have an image
+      let messageContent: any;
+      if (imageData) {
+        messageContent = [
+          { type: 'text', text: prompt },
+          buildImageContent(imageData, this.llmProviderType),
+        ];
+      } else {
+        messageContent = prompt;
+      }
+
+      // Call LLM using ChatCompletion interface
+      this.console.log(`[LLM] Calling getChatCompletion for ${eventType}...`);
+      const result = await llm.getChatCompletion({
+        messages: [
+          {
+            role: 'user',
+            content: messageContent,
+          },
+        ],
+        max_tokens: 100,
+        temperature: 0.7,
+      });
+
+      const content = result?.choices?.[0]?.message?.content;
+      if (content && typeof content === 'string') {
+        this.console.log(`[LLM] Got ${eventType} description: ${content.trim().substring(0, 50)}...`);
+        return content.trim();
+      }
+
+      this.console.warn(`[LLM] No content in response for ${eventType}`);
+      return null;
+    } catch (e) {
+      this.console.warn(`[LLM] ${eventType} description generation failed:`, e);
+      return null;
+    }
+  }
+
   /** Build LLM prompt with RAG context */
   private buildLlmPrompt(
     tracked: TrackedObject,
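
The multimodal branch above leans on two helpers that sit outside this diff, mediaObjectToBase64 and buildImageContent. For an OpenAI-compatible provider, buildImageContent plausibly emits an image_url content part wrapping a base64 data URL; a sketch under that assumption (the shape follows the standard OpenAI chat-completions vision format and is not confirmed from the package source):

    // Assumed helper shape; the package's real implementation may branch on
    // providerType (e.g. Anthropic-style providers use a different image part).
    function buildImageContent(
      imageData: { data: string; mediaType: string },
      providerType: string,
    ) {
      return {
        type: 'image_url',
        image_url: { url: `data:${imageData.mediaType};base64,${imageData.data}` },
      };
    }
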
package/src/core/tracking-engine.ts
CHANGED

@@ -536,11 +536,30 @@ export class TrackingEngine {
     // Check if we've already alerted for this object
     if (this.isInAlertCooldown(globalId)) return;
 
-    // Generate spatial description
-    const spatialResult = this.spatialReasoning.generateEntryDescription(
+    // Get snapshot for LLM description (if LLM is enabled)
+    let mediaObject: MediaObject | undefined;
+    this.console.log(`[Entry Alert] useLlmDescriptions=${this.config.useLlmDescriptions}`);
+    if (this.config.useLlmDescriptions) {
+      try {
+        const camera = systemManager.getDeviceById<Camera>(sighting.cameraId);
+        this.console.log(`[Entry Alert] Camera ${sighting.cameraId} has Camera interface: ${camera?.interfaces?.includes(ScryptedInterface.Camera)}`);
+        if (camera?.interfaces?.includes(ScryptedInterface.Camera)) {
+          mediaObject = await camera.takePicture();
+          this.console.log(`[Entry Alert] Got snapshot: ${!!mediaObject}`);
+        }
+      } catch (e) {
+        this.console.warn('[Entry Alert] Failed to get snapshot:', e);
+      }
+    }
+
+    // Generate spatial description (now async with LLM support)
+    this.console.log(`[Entry Alert] Calling generateEntryDescription with mediaObject=${!!mediaObject}`);
+    const spatialResult = await this.spatialReasoning.generateEntryDescription(
       tracked,
-      sighting.cameraId
+      sighting.cameraId,
+      mediaObject
     );
+    this.console.log(`[Entry Alert] Got description: "${spatialResult.description.substring(0, 60)}...", usedLlm=${spatialResult.usedLlm}`);
 
     if (isEntryPoint) {
       // Entry point - generate property entry alert
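
The snapshot guard above is repeated nearly verbatim in the exit path below, so a shared helper would be a natural follow-up refactor. A sketch using the @scrypted/sdk calls the diff already relies on (the helper itself is illustrative, not part of the package):

    import sdk, { Camera, MediaObject, ScryptedInterface } from '@scrypted/sdk';

    // Mirrors the guard used in both alert paths: only devices exposing the
    // Camera interface can take a picture, and any failure degrades to undefined.
    async function trySnapshot(cameraId: string, console: Console): Promise<MediaObject | undefined> {
      try {
        const camera = sdk.systemManager.getDeviceById<Camera>(cameraId);
        if (camera?.interfaces?.includes(ScryptedInterface.Camera))
          return await camera.takePicture();
      } catch (e) {
        console.warn('Failed to get snapshot:', e);
      }
      return undefined;
    }
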
@@ -626,14 +645,32 @@ export class TrackingEngine {
     if (current && current.state === 'pending') {
       this.state.markExited(tracked.globalId, sighting.cameraId, sighting.cameraName);
 
-      // Generate rich exit description using topology context
-      const spatialResult = this.spatialReasoning.generateExitDescription(
+      // Get snapshot for LLM description (if LLM is enabled)
+      let mediaObject: MediaObject | undefined;
+      this.console.log(`[Exit Alert] useLlmDescriptions=${this.config.useLlmDescriptions}`);
+      if (this.config.useLlmDescriptions) {
+        try {
+          const camera = systemManager.getDeviceById<Camera>(sighting.cameraId);
+          this.console.log(`[Exit Alert] Camera ${sighting.cameraId} has Camera interface: ${camera?.interfaces?.includes(ScryptedInterface.Camera)}`);
+          if (camera?.interfaces?.includes(ScryptedInterface.Camera)) {
+            mediaObject = await camera.takePicture();
+            this.console.log(`[Exit Alert] Got snapshot: ${!!mediaObject}`);
+          }
+        } catch (e) {
+          this.console.warn('[Exit Alert] Failed to get snapshot:', e);
+        }
+      }
+
+      // Generate rich exit description using topology context (now async with LLM support)
+      this.console.log(`[Exit Alert] Calling generateExitDescription with mediaObject=${!!mediaObject}`);
+      const spatialResult = await this.spatialReasoning.generateExitDescription(
         current,
-        sighting.cameraId
+        sighting.cameraId,
+        mediaObject
       );
 
       this.console.log(
-        `Object ${tracked.globalId.slice(0, 8)} exited: ${spatialResult.description}`
+        `[Exit Alert] Object ${tracked.globalId.slice(0, 8)} exited: "${spatialResult.description.substring(0, 60)}...", usedLlm=${spatialResult.usedLlm}`
       );
 
       await this.alertManager.checkAndAlert('property_exit', current, {
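
One operational note that falls out of these hunks: two separate flags gate the new behavior. TrackingEngine only captures a snapshot when useLlmDescriptions is set, and SpatialReasoningEngine only calls the LLM when enableLlm is set and a MediaObject arrived, so both must be enabled for LLM-generated descriptions; otherwise the code logs the skip and falls back to the topology-only text. Flag names are taken from the diff; the full config shapes are not shown:

    // Both flags must be on for vision-LLM entry/exit descriptions
    // (trackingConfig/spatialConfig are hypothetical handles to each config).
    trackingConfig.useLlmDescriptions = true; // TrackingEngine: capture + pass snapshot
    spatialConfig.enableLlm = true;           // SpatialReasoningEngine: call the LLM
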