@blueharford/scrypted-spatial-awareness 0.4.7 → 0.5.0-beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +156 -0
- package/package.json +1 -1
- package/src/core/object-correlator.ts +32 -7
- package/src/core/spatial-reasoning.ts +372 -45
- package/src/core/topology-discovery.ts +641 -0
- package/src/core/tracking-engine.ts +57 -19
- package/src/main.ts +289 -1
- package/src/models/alert.ts +41 -14
- package/src/models/discovery.ts +210 -0
- package/src/models/topology.ts +53 -0
- package/src/ui/editor-html.ts +467 -1
- package/dist/main.nodejs.js +0 -3
- package/dist/main.nodejs.js.LICENSE.txt +0 -1
- package/dist/main.nodejs.js.map +0 -1
- package/dist/plugin.zip +0 -0
- package/out/main.nodejs.js +0 -42336
- package/out/main.nodejs.js.map +0 -1
package/src/core/spatial-reasoning.ts

@@ -9,6 +9,8 @@ import sdk, {
     ObjectDetection,
     Camera,
     MediaObject,
+    ScryptedDevice,
+    ScryptedMimeTypes,
 } from '@scrypted/sdk';
 import {
     CameraTopology,
@@ -25,7 +27,7 @@ import {
 } from '../models/topology';
 import { TrackedObject, ObjectSighting } from '../models/tracked-object';

-const { systemManager } = sdk;
+const { systemManager, mediaManager } = sdk;

 /** Configuration for the spatial reasoning engine */
 export interface SpatialReasoningConfig {
@@ -61,11 +63,40 @@ interface ContextChunk {
     metadata: Record<string, any>;
 }

+/** Interface for ChatCompletion devices (from @scrypted/llm plugin) */
+interface ChatCompletionDevice extends ScryptedDevice {
+    getChatCompletion?(params: any): Promise<any>;
+    streamChatCompletion?(params: any): AsyncGenerator<any>;
+}
+
+/**
+ * Convert a MediaObject to a base64 data URL for vision LLM consumption
+ * @param mediaObject - MediaObject from camera.takePicture()
+ * @returns Base64 data URL (data:image/jpeg;base64,...) or null if conversion fails
+ */
+export async function mediaObjectToBase64(mediaObject: MediaObject): Promise<string | null> {
+    try {
+        // Convert MediaObject to Buffer using mediaManager
+        const buffer = await mediaManager.convertMediaObjectToBuffer(mediaObject, ScryptedMimeTypes.Image);
+
+        // Convert buffer to base64
+        const base64 = buffer.toString('base64');
+
+        // Determine MIME type - default to JPEG for camera images
+        const mimeType = mediaObject.mimeType?.split(';')[0] || 'image/jpeg';
+
+        return `data:${mimeType};base64,${base64}`;
+    } catch (e) {
+        console.warn('Failed to convert MediaObject to base64:', e);
+        return null;
+    }
+}
+
 export class SpatialReasoningEngine {
     private config: SpatialReasoningConfig;
     private console: Console;
     private topology: CameraTopology | null = null;
-    private llmDevice:
+    private llmDevice: ChatCompletionDevice | null = null;
     private contextChunks: ContextChunk[] = [];
     private topologyContextCache: string | null = null;
     private contextCacheTime: number = 0;
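
Note: mediaObjectToBase64 builds on the SDK's mediaManager.convertMediaObjectToBuffer, and the resulting data URL is the form OpenAI-style vision endpoints accept. A minimal usage sketch follows; the wrapper function and the import path are assumptions for illustration, only mediaObjectToBase64 itself comes from this diff.

    // Hypothetical usage sketch (not part of the diff): grab a camera
    // snapshot and convert it for a vision LLM request.
    import { Camera } from '@scrypted/sdk';
    import { mediaObjectToBase64 } from './core/spatial-reasoning'; // assumed path

    async function snapshotAsDataUrl(camera: Camera): Promise<string | null> {
        // takePicture() returns a MediaObject; the new helper turns it into
        // a data:image/...;base64 URL, or null if conversion fails.
        const mediaObject = await camera.takePicture();
        return mediaObjectToBase64(mediaObject);
    }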
@@ -303,30 +334,213 @@ export class SpatialReasoningEngine {
         return relevant;
     }

-
-    private
+    private llmSearched: boolean = false;
+    private llmProvider: string | null = null;
+
+    /** Find or initialize LLM device - looks for ChatCompletion interface from @scrypted/llm plugin */
+    private async findLlmDevice(): Promise<ChatCompletionDevice | null> {
         if (this.llmDevice) return this.llmDevice;
+        if (this.llmSearched) return null; // Already searched and found nothing
+
+        this.llmSearched = true;

         try {
+            // Look for devices with ChatCompletion interface (the correct interface for @scrypted/llm)
             for (const id of Object.keys(systemManager.getSystemState())) {
                 const device = systemManager.getDeviceById(id);
-                if (device
-
-
-
-
-
+                if (!device) continue;
+
+                // Check if this device has ChatCompletion interface
+                // The @scrypted/llm plugin exposes ChatCompletion, not ObjectDetection
+                if (device.interfaces?.includes('ChatCompletion')) {
+                    const deviceName = device.name?.toLowerCase() || '';
+                    const pluginId = (device as any).pluginId?.toLowerCase() || '';
+
+                    // Identify the provider type for logging
+                    let providerType = 'Unknown';
+                    if (pluginId.includes('@scrypted/llm') || pluginId.includes('llm')) {
+                        providerType = 'Scrypted LLM';
+                    }
+                    if (deviceName.includes('openai') || deviceName.includes('gpt')) {
+                        providerType = 'OpenAI';
+                    } else if (deviceName.includes('anthropic') || deviceName.includes('claude')) {
+                        providerType = 'Anthropic';
+                    } else if (deviceName.includes('ollama')) {
+                        providerType = 'Ollama';
+                    } else if (deviceName.includes('gemini') || deviceName.includes('google')) {
+                        providerType = 'Google';
+                    } else if (deviceName.includes('llama')) {
+                        providerType = 'llama.cpp';
                     }
+
+                    this.llmDevice = device as unknown as ChatCompletionDevice;
+                    this.llmProvider = `${providerType} (${device.name})`;
+                    this.console.log(`[LLM] Connected to ${providerType}: ${device.name}`);
+                    this.console.log(`[LLM] Plugin: ${pluginId || 'N/A'}`);
+                    this.console.log(`[LLM] Interfaces: ${device.interfaces?.join(', ')}`);
+                    return this.llmDevice;
                 }
             }
+
+            // If we get here, no LLM plugin found
+            this.console.warn('[LLM] No ChatCompletion device found. Install @scrypted/llm for enhanced descriptions.');
+            this.console.warn('[LLM] Falling back to rule-based descriptions using topology data.');
+
         } catch (e) {
-            this.console.
+            this.console.error('[LLM] Error searching for LLM device:', e);
         }

         return null;
     }

+    /** Get the current LLM provider name */
+    getLlmProvider(): string | null {
+        return this.llmProvider;
+    }
+
+    /** Check if LLM is available */
+    isLlmAvailable(): boolean {
+        return this.llmDevice !== null;
+    }
+
+    /** Generate entry description when object enters property */
+    generateEntryDescription(
+        tracked: TrackedObject,
+        cameraId: string
+    ): SpatialReasoningResult {
+        if (!this.topology) {
+            return {
+                description: `${this.capitalizeFirst(tracked.className)} entered property`,
+                involvedLandmarks: [],
+                confidence: 0.5,
+                usedLlm: false,
+            };
+        }
+
+        const camera = findCamera(this.topology, cameraId);
+        if (!camera) {
+            return {
+                description: `${this.capitalizeFirst(tracked.className)} entered property`,
+                involvedLandmarks: [],
+                confidence: 0.5,
+                usedLlm: false,
+            };
+        }
+
+        const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
+        const objectType = this.capitalizeFirst(tracked.className);
+
+        // Build entry description using topology context
+        const location = this.describeLocation(camera, landmarks, 'to');
+
+        // Check if we can determine where they came from (e.g., street, neighbor)
+        const entryLandmark = landmarks.find(l => l.isEntryPoint);
+        const streetLandmark = landmarks.find(l => l.type === 'street');
+        const neighborLandmark = landmarks.find(l => l.type === 'neighbor');
+
+        let source = '';
+        if (streetLandmark) {
+            source = ` from ${streetLandmark.name}`;
+        } else if (neighborLandmark) {
+            source = ` from ${neighborLandmark.name}`;
+        }
+
+        return {
+            description: `${objectType} arrived at ${location}${source}`,
+            involvedLandmarks: landmarks,
+            confidence: 0.8,
+            usedLlm: false,
+        };
+    }
+
+    /** Generate exit description when object leaves property */
+    generateExitDescription(
+        tracked: TrackedObject,
+        cameraId: string
+    ): SpatialReasoningResult {
+        if (!this.topology) {
+            return {
+                description: `${this.capitalizeFirst(tracked.className)} left property`,
+                involvedLandmarks: [],
+                confidence: 0.5,
+                usedLlm: false,
+            };
+        }
+
+        const camera = findCamera(this.topology, cameraId);
+        if (!camera) {
+            return {
+                description: `${this.capitalizeFirst(tracked.className)} left property`,
+                involvedLandmarks: [],
+                confidence: 0.5,
+                usedLlm: false,
+            };
+        }
+
+        const landmarks = getLandmarksVisibleFromCamera(this.topology, cameraId);
+        const objectType = this.capitalizeFirst(tracked.className);
+
+        // Build exit description
+        const location = this.describeLocation(camera, landmarks, 'from');
+
+        // Check for exit point landmarks
+        const exitLandmark = landmarks.find(l => l.isExitPoint);
+        const streetLandmark = landmarks.find(l => l.type === 'street');
+
+        let destination = '';
+        if (streetLandmark) {
+            destination = ` towards ${streetLandmark.name}`;
+        } else if (exitLandmark) {
+            destination = ` via ${exitLandmark.name}`;
+        }
+
+        // Include time on property if available
+        const dwellTime = Math.round((tracked.lastSeen - tracked.firstSeen) / 1000);
+        let timeContext = '';
+        if (dwellTime > 60) {
+            timeContext = ` after ${Math.round(dwellTime / 60)}m on property`;
+        } else if (dwellTime > 10) {
+            timeContext = ` after ${dwellTime}s`;
+        }
+
+        // Summarize journey if they visited multiple cameras (use landmarks from topology)
+        let journeyContext = '';
+        if (tracked.journey.length > 0 && this.topology) {
+            const visitedLandmarks: string[] = [];
+
+            // Get landmarks from entry camera
+            if (tracked.entryCamera) {
+                const entryLandmarks = getLandmarksVisibleFromCamera(this.topology, tracked.entryCamera);
+                const entryLandmark = entryLandmarks.find(l => l.isEntryPoint || l.type === 'access') || entryLandmarks[0];
+                if (entryLandmark) {
+                    visitedLandmarks.push(entryLandmark.name);
+                }
+            }
+
+            // Get landmarks from journey segments
+            for (const segment of tracked.journey) {
+                const segmentLandmarks = getLandmarksVisibleFromCamera(this.topology, segment.toCameraId);
+                const segmentLandmark = segmentLandmarks.find(l =>
+                    !visitedLandmarks.includes(l.name) && (l.type === 'access' || l.type === 'zone' || l.type === 'structure')
+                );
+                if (segmentLandmark && !visitedLandmarks.includes(segmentLandmark.name)) {
+                    visitedLandmarks.push(segmentLandmark.name);
+                }
+            }
+
+            if (visitedLandmarks.length > 1) {
+                journeyContext = ` — visited ${visitedLandmarks.join(' → ')}`;
+            }
+        }
+
+        return {
+            description: `${objectType} left ${location}${destination}${timeContext}${journeyContext}`,
+            involvedLandmarks: landmarks,
+            confidence: 0.8,
+            usedLlm: false,
+        };
+    }
+
     /** Generate rich movement description using LLM */
     async generateMovementDescription(
         tracked: TrackedObject,
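
Note: to make the new rule-based exit path concrete, here is a hedged sketch of a generateExitDescription call. The TrackedObject literal, camera IDs, and landmark names are invented for illustration; the output shape follows the SpatialReasoningResult usage in the hunk above.

    // Illustrative only; all field values are hypothetical, and `engine` is
    // an existing SpatialReasoningEngine instance with topology loaded.
    const tracked = {
        className: 'person',
        firstSeen: Date.now() - 120_000, // first seen 2 minutes ago
        lastSeen: Date.now(),
        entryCamera: 'front-door-cam',
        journey: [{ toCameraId: 'driveway-cam' }],
    } as unknown as TrackedObject;

    const result = engine.generateExitDescription(tracked, 'driveway-cam');
    // With a street landmark visible from the driveway camera, this yields
    // something like: "Person left the driveway towards Main Street after 2m on property"
    // result.usedLlm is false; this path never calls the LLM.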
@@ -415,28 +629,92 @@ export class SpatialReasoningEngine {
         const objectType = this.capitalizeFirst(tracked.className);
         const transitSecs = Math.round(transitTime / 1000);

-        //
-
-
-
-
-        } else if (fromCamera.context?.coverageDescription) {
-            origin = fromCamera.context.coverageDescription.split('.')[0];
-        }
+        // Get connection for path context
+        const connection = this.topology ? findConnection(this.topology, fromCamera.deviceId, toCamera.deviceId) : null;
+
+        // Build origin description using landmarks, camera context, or camera name
+        let origin = this.describeLocation(fromCamera, fromLandmarks, 'from');

         // Build destination description
-        let destination = toCamera
-
-
-
-
+        let destination = this.describeLocation(toCamera, toLandmarks, 'to');
+
+        // Check if we have a named path/connection
+        let pathContext = '';
+        if (connection?.name) {
+            pathContext = ` via ${connection.name}`;
+        } else if (connection?.pathLandmarks?.length && this.topology) {
+            const pathNames = connection.pathLandmarks
+                .map(id => findLandmark(this.topology!, id)?.name)
+                .filter(Boolean);
+            if (pathNames.length > 0) {
+                pathContext = ` past ${pathNames.join(' and ')}`;
+            }
         }

-        //
-
+        // Include journey context if this is not the first camera
+        let journeyContext = '';
+        if (tracked.journey.length > 0) {
+            const totalTime = Math.round((Date.now() - tracked.firstSeen) / 1000);
+            if (totalTime > 60) {
+                journeyContext = ` (${Math.round(totalTime / 60)}m on property)`;
+            }
+        }
+
+        // Determine movement verb based on transit time and object type
+        const verb = this.getMovementVerb(tracked.className, transitSecs);
+
+        return `${objectType} ${verb} ${origin} heading ${destination}${pathContext}${journeyContext}`;
+    }
+
+    /** Describe a location using landmarks, camera context, or camera name */
+    private describeLocation(camera: CameraNode, landmarks: Landmark[], direction: 'from' | 'to'): string {
+        // Priority 1: Use entry/exit landmarks
+        const entryExitLandmark = landmarks.find(l =>
+            (direction === 'from' && l.isExitPoint) || (direction === 'to' && l.isEntryPoint)
+        );
+        if (entryExitLandmark) {
+            return direction === 'from' ? `the ${entryExitLandmark.name}` : `the ${entryExitLandmark.name}`;
+        }
+
+        // Priority 2: Use access landmarks (driveway, walkway, etc.)
+        const accessLandmark = landmarks.find(l => l.type === 'access');
+        if (accessLandmark) {
+            return `the ${accessLandmark.name}`;
+        }
+
+        // Priority 3: Use zone landmarks (front yard, back yard)
+        const zoneLandmark = landmarks.find(l => l.type === 'zone');
+        if (zoneLandmark) {
+            return `the ${zoneLandmark.name}`;
+        }
+
+        // Priority 4: Use any landmark
+        if (landmarks.length > 0) {
+            return `near ${landmarks[0].name}`;
+        }
+
+        // Priority 5: Use camera coverage description
+        if (camera.context?.coverageDescription) {
+            const desc = camera.context.coverageDescription.split('.')[0].toLowerCase();
+            return `the ${desc}`;
+        }
+
+        // Fallback: Generic description (no camera name inference - use topology for context)
+        return direction === 'from' ? 'property' : 'property';
+    }

-
+    /** Get appropriate movement verb based on context */
+    private getMovementVerb(className: string, transitSecs: number): string {
+        if (className === 'car' || className === 'vehicle' || className === 'truck') {
+            return transitSecs < 10 ? 'driving from' : 'moved from';
+        }
+        if (transitSecs < 5) {
+            return 'walking from';
+        }
+        if (transitSecs < 30) {
+            return 'moved from';
+        }
+        return 'traveled from';
     }

     /** Build path description from connection */
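
Note: the new verb selection is worth a quick reference. The calls below are hypothetical (getMovementVerb is private); the expected values follow directly from the thresholds in the hunk above.

    // Expected behavior of getMovementVerb(className, transitSecs):
    //   ('car', 8)     -> 'driving from'   (vehicle, under 10s transit)
    //   ('car', 25)    -> 'moved from'     (vehicle, 10s or more)
    //   ('person', 3)  -> 'walking from'   (under 5s)
    //   ('person', 20) -> 'moved from'     (under 30s)
    //   ('person', 90) -> 'traveled from'  (30s or more)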
@@ -458,7 +736,7 @@ export class SpatialReasoningEngine {
         return connection.name || undefined;
     }

-    /** Get LLM-enhanced description */
+    /** Get LLM-enhanced description using ChatCompletion interface with vision support */
     private async getLlmEnhancedDescription(
         tracked: TrackedObject,
         fromCamera: CameraNode,
@@ -469,9 +747,12 @@ export class SpatialReasoningEngine {
         mediaObject: MediaObject
     ): Promise<string | null> {
         const llm = await this.findLlmDevice();
-        if (!llm) return null;
+        if (!llm || !llm.getChatCompletion) return null;

         try {
+            // Convert image to base64 for vision LLM
+            const imageBase64 = await mediaObjectToBase64(mediaObject);
+
             // Retrieve relevant context for RAG
             const relevantChunks = this.retrieveRelevantContext(
                 fromCamera.deviceId,
@@ -492,14 +773,35 @@
                 ragContext
             );

-        //
-
-
-
+            // Build message content - use multimodal format if we have an image
+            let messageContent: any;
+            if (imageBase64) {
+                // Vision-capable multimodal message format (OpenAI compatible)
+                messageContent = [
+                    { type: 'text', text: prompt },
+                    { type: 'image_url', image_url: { url: imageBase64 } },
+                ];
+            } else {
+                // Fallback to text-only if image conversion failed
+                messageContent = prompt;
+            }
+
+            // Call LLM using ChatCompletion interface
+            const result = await llm.getChatCompletion({
+                messages: [
+                    {
+                        role: 'user',
+                        content: messageContent,
+                    },
+                ],
+                max_tokens: 150,
+                temperature: 0.7,
+            });

-            // Extract description from result
-
-
+            // Extract description from ChatCompletion result
+            const content = result?.choices?.[0]?.message?.content;
+            if (content && typeof content === 'string') {
+                return content.trim();
             }

             return null;
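
Note: the request built above follows the OpenAI chat-completions convention for multimodal content. A standalone sketch of the payload and response handling, for reference; the prompt text and data URL are placeholders, and the choices[0].message.content response shape is the OpenAI-compatible form the hunk already relies on.

    // Sketch of the ChatCompletion call shape used above (values illustrative).
    const response = await llm.getChatCompletion({
        messages: [{
            role: 'user',
            content: [
                { type: 'text', text: 'Describe this movement in one sentence.' },
                { type: 'image_url', image_url: { url: 'data:image/jpeg;base64,...' } },
            ],
        }],
        max_tokens: 150,
        temperature: 0.7,
    });
    const content = response?.choices?.[0]?.message?.content;
    const description = typeof content === 'string' ? content.trim() : null;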
@@ -547,7 +849,7 @@ Examples of good descriptions:
 Generate ONLY the description, nothing else:`;
     }

-    /** Suggest a new landmark based on AI analysis */
+    /** Suggest a new landmark based on AI analysis using ChatCompletion with vision */
     async suggestLandmark(
         cameraId: string,
         mediaObject: MediaObject,
@@ -557,9 +859,12 @@ Generate ONLY the description, nothing else:`;
         if (!this.config.enableLandmarkLearning) return null;

         const llm = await this.findLlmDevice();
-        if (!llm) return null;
+        if (!llm || !llm.getChatCompletion) return null;

         try {
+            // Convert image to base64 for vision LLM
+            const imageBase64 = await mediaObjectToBase64(mediaObject);
+
             const prompt = `Analyze this security camera image. A ${objectClass} was detected.

 Looking at the surroundings and environment, identify any notable landmarks or features visible that could help describe this location. Consider:
@@ -573,13 +878,35 @@ If you can identify a clear landmark feature, respond with ONLY a JSON object:

 If no clear landmark is identifiable, respond with: {"name": null}`;

-
-
-
+            // Build message content - use multimodal format if we have an image
+            let messageContent: any;
+            if (imageBase64) {
+                // Vision-capable multimodal message format (OpenAI compatible)
+                messageContent = [
+                    { type: 'text', text: prompt },
+                    { type: 'image_url', image_url: { url: imageBase64 } },
+                ];
+            } else {
+                // Fallback to text-only if image conversion failed
+                messageContent = prompt;
+            }
+
+            // Call LLM using ChatCompletion interface
+            const result = await llm.getChatCompletion({
+                messages: [
+                    {
+                        role: 'user',
+                        content: messageContent,
+                    },
+                ],
+                max_tokens: 100,
+                temperature: 0.3,
+            });

-
+            const content = result?.choices?.[0]?.message?.content;
+            if (content && typeof content === 'string') {
                 try {
-                    const parsed = JSON.parse(
+                    const parsed = JSON.parse(content.trim());
                     if (parsed.name && parsed.type) {
                         const suggestionId = `suggest_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`;
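
Note: for reference, the reply shapes the parsing above accepts, per the prompt and the parsed.name && parsed.type guard. The example values are hypothetical, and the error handling is inferred since the diff is truncated at this point.

    // Accepted: bare JSON naming a landmark and its type, e.g.
    //   {"name": "garden shed", "type": "structure"}
    // Rejected by the guard: {"name": null}
    // Anything that is not bare JSON throws inside JSON.parse(content.trim())
    // and is presumably swallowed by the surrounding try/catch.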