@blueharford/scrypted-spatial-awareness 0.6.7 → 0.6.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/plugin.zip CHANGED
Binary file
@@ -35497,7 +35497,7 @@ class SpatialReasoningEngine {
         return this.llmDevice !== null;
     }
     /** Generate entry description when object enters property */
-    generateEntryDescription(tracked, cameraId) {
+    async generateEntryDescription(tracked, cameraId, mediaObject) {
         if (!this.topology) {
             return {
                 description: `${this.capitalizeFirst(tracked.className)} entered property`,
@@ -35517,10 +35517,9 @@ class SpatialReasoningEngine {
         }
         const landmarks = (0, topology_1.getLandmarksVisibleFromCamera)(this.topology, cameraId);
         const objectType = this.capitalizeFirst(tracked.className);
-        // Build entry description using topology context
+        // Build basic entry description using topology context
         const location = this.describeLocation(camera, landmarks, 'to');
         // Check if we can determine where they came from (e.g., street, neighbor)
-        const entryLandmark = landmarks.find(l => l.isEntryPoint);
         const streetLandmark = landmarks.find(l => l.type === 'street');
         const neighborLandmark = landmarks.find(l => l.type === 'neighbor');
         let source = '';
@@ -35530,15 +35529,35 @@ class SpatialReasoningEngine {
         else if (neighborLandmark) {
             source = ` from ${neighborLandmark.name}`;
         }
+        const basicDescription = `${objectType} arrived at ${location}${source}`;
+        // Try LLM for enhanced description with visual details
+        this.console.log(`[Entry] enableLlm=${this.config.enableLlm}, hasMediaObject=${!!mediaObject}`);
+        if (this.config.enableLlm && mediaObject) {
+            this.console.log(`[Entry] Attempting LLM description for entry event`);
+            const llmDescription = await this.getLlmEntryExitDescription(tracked, camera, landmarks, 'entry', mediaObject);
+            if (llmDescription) {
+                this.console.log(`[Entry] LLM returned: ${llmDescription.substring(0, 50)}...`);
+                return {
+                    description: llmDescription,
+                    involvedLandmarks: landmarks,
+                    confidence: 0.9,
+                    usedLlm: true,
+                };
+            }
+            this.console.warn(`[Entry] LLM returned null, falling back to basic`);
+        }
+        else {
+            this.console.log(`[Entry] Skipping LLM (enableLlm=${this.config.enableLlm}, mediaObject=${!!mediaObject})`);
+        }
         return {
-            description: `${objectType} arrived at ${location}${source}`,
+            description: basicDescription,
             involvedLandmarks: landmarks,
             confidence: 0.8,
             usedLlm: false,
         };
     }
     /** Generate exit description when object leaves property */
-    generateExitDescription(tracked, cameraId) {
+    async generateExitDescription(tracked, cameraId, mediaObject) {
         if (!this.topology) {
             return {
                 description: `${this.capitalizeFirst(tracked.className)} left property`,
@@ -35558,7 +35577,7 @@ class SpatialReasoningEngine {
         }
         const landmarks = (0, topology_1.getLandmarksVisibleFromCamera)(this.topology, cameraId);
         const objectType = this.capitalizeFirst(tracked.className);
-        // Build exit description
+        // Build basic exit description
         const location = this.describeLocation(camera, landmarks, 'from');
         // Check for exit point landmarks
         const exitLandmark = landmarks.find(l => l.isExitPoint);
@@ -35603,8 +35622,28 @@
                 journeyContext = ` — visited ${visitedLandmarks.join(' → ')}`;
             }
         }
+        const basicDescription = `${objectType} left ${location}${destination}${timeContext}${journeyContext}`;
+        // Try LLM for enhanced description with visual details
+        this.console.log(`[Exit] enableLlm=${this.config.enableLlm}, hasMediaObject=${!!mediaObject}`);
+        if (this.config.enableLlm && mediaObject) {
+            this.console.log(`[Exit] Attempting LLM description for exit event`);
+            const llmDescription = await this.getLlmEntryExitDescription(tracked, camera, landmarks, 'exit', mediaObject, journeyContext);
+            if (llmDescription) {
+                this.console.log(`[Exit] LLM returned: ${llmDescription.substring(0, 50)}...`);
+                return {
+                    description: llmDescription,
+                    involvedLandmarks: landmarks,
+                    confidence: 0.9,
+                    usedLlm: true,
+                };
+            }
+            this.console.warn(`[Exit] LLM returned null, falling back to basic`);
+        }
+        else {
+            this.console.log(`[Exit] Skipping LLM (enableLlm=${this.config.enableLlm}, mediaObject=${!!mediaObject})`);
+        }
         return {
-            description: `${objectType} left ${location}${destination}${timeContext}${journeyContext}`,
+            description: basicDescription,
             involvedLandmarks: landmarks,
             confidence: 0.8,
             usedLlm: false,
@@ -35801,6 +35840,107 @@ class SpatialReasoningEngine {
             return null;
         }
     }
+    /** Get LLM-enhanced description for entry/exit events */
+    async getLlmEntryExitDescription(tracked, camera, landmarks, eventType, mediaObject, journeyContext) {
+        this.console.log(`[LLM] getLlmEntryExitDescription called for ${eventType} event`);
+        const llm = await this.findLlmDevice();
+        if (!llm) {
+            this.console.warn(`[LLM] No LLM device found for ${eventType} description`);
+            return null;
+        }
+        if (!llm.getChatCompletion) {
+            this.console.warn(`[LLM] LLM device has no getChatCompletion method`);
+            return null;
+        }
+        this.console.log(`[LLM] Using LLM device: ${this.llmProvider}`);
+        try {
+            // Convert image to base64 for vision LLM
+            const imageData = await mediaObjectToBase64(mediaObject);
+            this.console.log(`[LLM] Image converted: ${imageData ? 'success' : 'failed'}, type: ${imageData?.mediaType}`);
+            const landmarkNames = landmarks.map(l => l.name).join(', ') || 'none identified';
+            const dwellTime = Math.round((tracked.lastSeen - tracked.firstSeen) / 1000);
+            // Build context-aware prompt
+            const prompt = eventType === 'entry'
+                ? `You are a security camera system. Analyze this image and describe who/what just arrived.
+
+CONTEXT:
+- Camera: ${camera.name}
+- Object type: ${tracked.className}
+- Nearby landmarks: ${landmarkNames}
+
+INSTRUCTIONS:
+Look at the image and generate a single, natural sentence describing:
+1. Physical description (if person: gender, clothing, items carried; if vehicle: color, type, make)
+2. What they appear to be doing (arriving, approaching, etc.)
+3. Relevant landmark context (driveway, front door, mailbox, etc.)
+
+Examples of good descriptions:
+- "Man in gray hoodie approaching the front door"
+- "Woman in scrubs arriving with shopping bags"
+- "White delivery van pulling into the driveway"
+- "UPS driver carrying package towards the porch"
+- "Teenager on bicycle coming up the driveway"
+
+Generate ONLY the description, nothing else:`
+                : `You are a security camera system. Analyze this image and describe who/what is leaving.
+
+CONTEXT:
+- Camera: ${camera.name}
+- Object type: ${tracked.className}
+- Time on property: ${dwellTime > 60 ? Math.round(dwellTime / 60) + ' minutes' : dwellTime + ' seconds'}
+- Nearby landmarks: ${landmarkNames}
+${journeyContext ? `- Journey: ${journeyContext}` : ''}
+
+INSTRUCTIONS:
+Look at the image and generate a single, natural sentence describing:
+1. Physical description (if person: gender, clothing, items carried; if vehicle: color, type)
+2. What they did (if determinable from context)
+3. Direction they're leaving towards
+
+Examples of good descriptions:
+- "Man in black hoodie leaving after checking the mailbox"
+- "Woman in business attire heading to car in driveway"
+- "Red sedan backing out of the driveway"
+- "Delivery driver returning to FedEx truck after leaving package"
+- "Landscaper with leaf blower heading to work truck"
+
+Generate ONLY the description, nothing else:`;
+            // Build message content - use multimodal format if we have an image
+            let messageContent;
+            if (imageData) {
+                messageContent = [
+                    { type: 'text', text: prompt },
+                    buildImageContent(imageData, this.llmProviderType),
+                ];
+            }
+            else {
+                messageContent = prompt;
+            }
+            // Call LLM using ChatCompletion interface
+            this.console.log(`[LLM] Calling getChatCompletion for ${eventType}...`);
+            const result = await llm.getChatCompletion({
+                messages: [
+                    {
+                        role: 'user',
+                        content: messageContent,
+                    },
+                ],
+                max_tokens: 100,
+                temperature: 0.7,
+            });
+            const content = result?.choices?.[0]?.message?.content;
+            if (content && typeof content === 'string') {
+                this.console.log(`[LLM] Got ${eventType} description: ${content.trim().substring(0, 50)}...`);
+                return content.trim();
+            }
+            this.console.warn(`[LLM] No content in response for ${eventType}`);
+            return null;
+        }
+        catch (e) {
+            this.console.warn(`[LLM] ${eventType} description generation failed:`, e);
+            return null;
+        }
+    }
     /** Build LLM prompt with RAG context */
     buildLlmPrompt(tracked, fromCamera, toCamera, transitTime, fromLandmarks, toLandmarks, ragContext) {
         const transitSecs = Math.round(transitTime / 1000);
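
Note: mediaObjectToBase64 and buildImageContent are helpers defined elsewhere in the bundle and do not appear in this diff, so the exact shape of the image content part is not visible here. As a sketch only, assuming an OpenAI-compatible chat-completions provider, the image part conventionally looks like the following; the base64/mediaType field names follow the imageData?.mediaType access above, and everything else is assumption.

// Hypothetical reconstruction of the image content part for an OpenAI-style
// provider; the real buildImageContent also takes a provider-type argument
// and may emit a different shape for other backends.
type ImageData = { base64: string; mediaType: string };

function buildImageContentSketch(imageData: ImageData): object {
    return {
        type: 'image_url',
        image_url: {
            // The snapshot travels inline as a data URI, e.g. data:image/jpeg;base64,...
            url: `data:${imageData.mediaType};base64,${imageData.base64}`,
        },
    };
}

When no image is available, messageContent stays a plain string, the ordinary text-only chat-completion shape, so the same getChatCompletion call handles both cases.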
@@ -37106,8 +37246,26 @@ class TrackingEngine {
        // Check if we've already alerted for this object
        if (this.isInAlertCooldown(globalId))
            return;
-        // Generate spatial description
-        const spatialResult = this.spatialReasoning.generateEntryDescription(tracked, sighting.cameraId);
+        // Get snapshot for LLM description (if LLM is enabled)
+        let mediaObject;
+        this.console.log(`[Entry Alert] useLlmDescriptions=${this.config.useLlmDescriptions}`);
+        if (this.config.useLlmDescriptions) {
+            try {
+                const camera = systemManager.getDeviceById(sighting.cameraId);
+                this.console.log(`[Entry Alert] Camera ${sighting.cameraId} has Camera interface: ${camera?.interfaces?.includes(sdk_1.ScryptedInterface.Camera)}`);
+                if (camera?.interfaces?.includes(sdk_1.ScryptedInterface.Camera)) {
+                    mediaObject = await camera.takePicture();
+                    this.console.log(`[Entry Alert] Got snapshot: ${!!mediaObject}`);
+                }
+            }
+            catch (e) {
+                this.console.warn('[Entry Alert] Failed to get snapshot:', e);
+            }
+        }
+        // Generate spatial description (now async with LLM support)
+        this.console.log(`[Entry Alert] Calling generateEntryDescription with mediaObject=${!!mediaObject}`);
+        const spatialResult = await this.spatialReasoning.generateEntryDescription(tracked, sighting.cameraId, mediaObject);
+        this.console.log(`[Entry Alert] Got description: "${spatialResult.description.substring(0, 60)}...", usedLlm=${spatialResult.usedLlm}`);
        if (isEntryPoint) {
            // Entry point - generate property entry alert
            await this.alertManager.checkAndAlert('property_entry', tracked, {
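
Note: the snapshot block above only calls takePicture() after confirming the device actually advertises the Scrypted Camera interface, and it swallows failures so a broken snapshot never blocks the alert. The same pattern in TypeScript source form, using the typed @scrypted/sdk API instead of the compiled sdk_1 references:

import sdk, { Camera, MediaObject, ScryptedInterface } from '@scrypted/sdk';

// Mirrors the compiled block above: request a still only if the device
// implements Camera, and treat any snapshot failure as "no image".
async function trySnapshot(cameraId: string): Promise<MediaObject | undefined> {
    const device = sdk.systemManager.getDeviceById<Camera>(cameraId);
    if (!device?.interfaces?.includes(ScryptedInterface.Camera))
        return undefined;
    try {
        return await device.takePicture();
    }
    catch (e) {
        console.warn('Failed to get snapshot:', e);
        return undefined; // the caller falls back to the text-only description
    }
}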
@@ -37182,9 +37340,26 @@
        const current = this.state.getObject(tracked.globalId);
        if (current && current.state === 'pending') {
            this.state.markExited(tracked.globalId, sighting.cameraId, sighting.cameraName);
-            // Generate rich exit description using topology context
-            const spatialResult = this.spatialReasoning.generateExitDescription(current, sighting.cameraId);
-            this.console.log(`Object ${tracked.globalId.slice(0, 8)} exited: ${spatialResult.description}`);
+            // Get snapshot for LLM description (if LLM is enabled)
+            let mediaObject;
+            this.console.log(`[Exit Alert] useLlmDescriptions=${this.config.useLlmDescriptions}`);
+            if (this.config.useLlmDescriptions) {
+                try {
+                    const camera = systemManager.getDeviceById(sighting.cameraId);
+                    this.console.log(`[Exit Alert] Camera ${sighting.cameraId} has Camera interface: ${camera?.interfaces?.includes(sdk_1.ScryptedInterface.Camera)}`);
+                    if (camera?.interfaces?.includes(sdk_1.ScryptedInterface.Camera)) {
+                        mediaObject = await camera.takePicture();
+                        this.console.log(`[Exit Alert] Got snapshot: ${!!mediaObject}`);
+                    }
+                }
+                catch (e) {
+                    this.console.warn('[Exit Alert] Failed to get snapshot:', e);
+                }
+            }
+            // Generate rich exit description using topology context (now async with LLM support)
+            this.console.log(`[Exit Alert] Calling generateExitDescription with mediaObject=${!!mediaObject}`);
+            const spatialResult = await this.spatialReasoning.generateExitDescription(current, sighting.cameraId, mediaObject);
+            this.console.log(`[Exit Alert] Object ${tracked.globalId.slice(0, 8)} exited: "${spatialResult.description.substring(0, 60)}...", usedLlm=${spatialResult.usedLlm}`);
            await this.alertManager.checkAndAlert('property_exit', current, {
                cameraId: sighting.cameraId,
                cameraName: sighting.cameraName,