@elizaos/plugin-vision 1.2.1 → 2.0.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. package/build.config.ts +53 -53
  2. package/dist/index.js +6716 -67
  3. package/dist/index.js.map +33 -1
  4. package/dist/workers/florence2-worker.js +112304 -307
  5. package/dist/workers/florence2-worker.js.map +92 -1
  6. package/dist/workers/ocr-worker.js +119718 -339
  7. package/dist/workers/ocr-worker.js.map +137 -1
  8. package/dist/workers/screen-capture-worker.js +350 -418
  9. package/dist/workers/screen-capture-worker.js.map +11 -1
  10. package/package.json +13 -17
  11. package/README.md +0 -270
  12. package/dist/action.d.ts +0 -8
  13. package/dist/action.js +0 -1212
  14. package/dist/action.js.map +0 -1
  15. package/dist/audio-capture-stream.d.ts +0 -42
  16. package/dist/audio-capture-stream.js +0 -516
  17. package/dist/audio-capture-stream.js.map +0 -1
  18. package/dist/audio-capture.d.ts +0 -25
  19. package/dist/audio-capture.js +0 -412
  20. package/dist/audio-capture.js.map +0 -1
  21. package/dist/basic.test.d.ts +0 -1
  22. package/dist/basic.test.js +0 -97
  23. package/dist/basic.test.js.map +0 -1
  24. package/dist/config.d.ts +0 -73
  25. package/dist/config.js +0 -254
  26. package/dist/config.js.map +0 -1
  27. package/dist/entity-tracker.d.ts +0 -32
  28. package/dist/entity-tracker.js +0 -361
  29. package/dist/entity-tracker.js.map +0 -1
  30. package/dist/errors.d.ts +0 -67
  31. package/dist/errors.js +0 -395
  32. package/dist/errors.js.map +0 -1
  33. package/dist/face-recognition.d.ts +0 -31
  34. package/dist/face-recognition.js +0 -332
  35. package/dist/face-recognition.js.map +0 -1
  36. package/dist/florence2-local.d.ts +0 -25
  37. package/dist/florence2-local.js +0 -280
  38. package/dist/florence2-local.js.map +0 -1
  39. package/dist/florence2-model.d.ts +0 -36
  40. package/dist/florence2-model.js +0 -503
  41. package/dist/florence2-model.js.map +0 -1
  42. package/dist/index.d.ts +0 -3
  43. package/dist/ocr-service-real.d.ts +0 -32
  44. package/dist/ocr-service-real.js +0 -396
  45. package/dist/ocr-service-real.js.map +0 -1
  46. package/dist/ocr-service.d.ts +0 -28
  47. package/dist/ocr-service.js +0 -216
  48. package/dist/ocr-service.js.map +0 -1
  49. package/dist/provider.d.ts +0 -2
  50. package/dist/provider.js +0 -285
  51. package/dist/provider.js.map +0 -1
  52. package/dist/screen-capture.d.ts +0 -16
  53. package/dist/screen-capture.js +0 -302
  54. package/dist/screen-capture.js.map +0 -1
  55. package/dist/service.d.ts +0 -73
  56. package/dist/service.js +0 -1662
  57. package/dist/service.js.map +0 -1
  58. package/dist/tests/e2e/index.d.ts +0 -8
  59. package/dist/tests/e2e/index.js +0 -33
  60. package/dist/tests/e2e/index.js.map +0 -1
  61. package/dist/tests/e2e/run-local.d.ts +0 -2
  62. package/dist/tests/e2e/run-local.js +0 -166
  63. package/dist/tests/e2e/run-local.js.map +0 -1
  64. package/dist/tests/e2e/screen-vision.d.ts +0 -11
  65. package/dist/tests/e2e/screen-vision.js +0 -384
  66. package/dist/tests/e2e/screen-vision.js.map +0 -1
  67. package/dist/tests/e2e/vision-autonomy.d.ts +0 -11
  68. package/dist/tests/e2e/vision-autonomy.js +0 -375
  69. package/dist/tests/e2e/vision-autonomy.js.map +0 -1
  70. package/dist/tests/e2e/vision-basic.d.ts +0 -11
  71. package/dist/tests/e2e/vision-basic.js +0 -434
  72. package/dist/tests/e2e/vision-basic.js.map +0 -1
  73. package/dist/tests/e2e/vision-capture-log.d.ts +0 -11
  74. package/dist/tests/e2e/vision-capture-log.js +0 -302
  75. package/dist/tests/e2e/vision-capture-log.js.map +0 -1
  76. package/dist/tests/e2e/vision-runtime.d.ts +0 -11
  77. package/dist/tests/e2e/vision-runtime.js +0 -357
  78. package/dist/tests/e2e/vision-runtime.js.map +0 -1
  79. package/dist/tests/e2e/vision-worker-tests.d.ts +0 -11
  80. package/dist/tests/e2e/vision-worker-tests.js +0 -466
  81. package/dist/tests/e2e/vision-worker-tests.js.map +0 -1
  82. package/dist/tests/test-pattern-generator.d.ts +0 -40
  83. package/dist/tests/test-pattern-generator.js +0 -191
  84. package/dist/tests/test-pattern-generator.js.map +0 -1
  85. package/dist/tests.d.ts +0 -3
  86. package/dist/tests.js +0 -11
  87. package/dist/tests.js.map +0 -1
  88. package/dist/types.d.ts +0 -222
  89. package/dist/types.js +0 -16
  90. package/dist/types.js.map +0 -1
  91. package/dist/vision-models.d.ts +0 -47
  92. package/dist/vision-models.js +0 -501
  93. package/dist/vision-models.js.map +0 -1
  94. package/dist/vision-worker-manager.d.ts +0 -61
  95. package/dist/vision-worker-manager.js +0 -668
  96. package/dist/vision-worker-manager.js.map +0 -1
  97. package/dist/workers/florence2-worker-simple.d.ts +0 -13
  98. package/dist/workers/florence2-worker-simple.js +0 -121
  99. package/dist/workers/florence2-worker-simple.js.map +0 -1
  100. package/dist/workers/florence2-worker.d.ts +0 -1
  101. package/dist/workers/ocr-worker.d.ts +0 -1
  102. package/dist/workers/screen-capture-worker.d.ts +0 -1
  103. package/dist/workers/worker-logger.d.ts +0 -9
  104. package/dist/workers/worker-logger.js +0 -95
  105. package/dist/workers/worker-logger.js.map +0 -1
package/dist/service.js DELETED
@@ -1,1662 +0,0 @@
1
- "use strict";
2
- var __extends = (this && this.__extends) || (function () {
3
- var extendStatics = function (d, b) {
4
- extendStatics = Object.setPrototypeOf ||
5
- ({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||
6
- function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };
7
- return extendStatics(d, b);
8
- };
9
- return function (d, b) {
10
- if (typeof b !== "function" && b !== null)
11
- throw new TypeError("Class extends value " + String(b) + " is not a constructor or null");
12
- extendStatics(d, b);
13
- function __() { this.constructor = d; }
14
- d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
15
- };
16
- })();
17
- var __assign = (this && this.__assign) || function () {
18
- __assign = Object.assign || function(t) {
19
- for (var s, i = 1, n = arguments.length; i < n; i++) {
20
- s = arguments[i];
21
- for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
22
- t[p] = s[p];
23
- }
24
- return t;
25
- };
26
- return __assign.apply(this, arguments);
27
- };
28
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
29
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
30
- return new (P || (P = Promise))(function (resolve, reject) {
31
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
32
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
33
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
34
- step((generator = generator.apply(thisArg, _arguments || [])).next());
35
- });
36
- };
37
- var __generator = (this && this.__generator) || function (thisArg, body) {
38
- var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
39
- return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
40
- function verb(n) { return function (v) { return step([n, v]); }; }
41
- function step(op) {
42
- if (f) throw new TypeError("Generator is already executing.");
43
- while (g && (g = 0, op[0] && (_ = 0)), _) try {
44
- if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
45
- if (y = 0, t) op = [op[0] & 2, t.value];
46
- switch (op[0]) {
47
- case 0: case 1: t = op; break;
48
- case 4: _.label++; return { value: op[1], done: false };
49
- case 5: _.label++; y = op[1]; op = [0]; continue;
50
- case 7: op = _.ops.pop(); _.trys.pop(); continue;
51
- default:
52
- if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
53
- if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
54
- if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
55
- if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
56
- if (t[2]) _.ops.pop();
57
- _.trys.pop(); continue;
58
- }
59
- op = body.call(thisArg, _);
60
- } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
61
- if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
62
- }
63
- };
64
- var __spreadArray = (this && this.__spreadArray) || function (to, from, pack) {
65
- if (pack || arguments.length === 2) for (var i = 0, l = from.length, ar; i < l; i++) {
66
- if (ar || !(i in from)) {
67
- if (!ar) ar = Array.prototype.slice.call(from, 0, i);
68
- ar[i] = from[i];
69
- }
70
- }
71
- return to.concat(ar || Array.prototype.slice.call(from));
72
- };
73
- Object.defineProperty(exports, "__esModule", { value: true });
74
- exports.VisionService = void 0;
75
- // Vision service for camera integration and scene analysis
76
- var core_1 = require("@elizaos/core");
77
- var child_process_1 = require("child_process");
78
- var fs = require("fs/promises");
79
- var path = require("path");
80
- var sharp_1 = require("sharp");
81
- var util_1 = require("util");
82
- var audio_capture_1 = require("./audio-capture");
83
- var audio_capture_stream_1 = require("./audio-capture-stream");
84
- var entity_tracker_1 = require("./entity-tracker");
85
- var face_recognition_1 = require("./face-recognition");
86
- var florence2_model_1 = require("./florence2-model");
87
- var ocr_service_1 = require("./ocr-service");
88
- var screen_capture_1 = require("./screen-capture");
89
- var types_1 = require("./types");
90
- var vision_models_1 = require("./vision-models");
91
- var vision_worker_manager_1 = require("./vision-worker-manager");
92
- var execAsync = (0, util_1.promisify)(child_process_1.exec);
93
- var VisionService = /** @class */ (function (_super) {
94
- __extends(VisionService, _super);
95
- function VisionService(runtime) {
96
- var _this = _super.call(this, runtime) || this;
97
- _this.capabilityDescription = 'Provides visual perception through camera integration and scene analysis.';
98
- _this.camera = null;
99
- _this.lastFrame = null;
100
- _this.lastSceneDescription = null;
101
- _this.frameProcessingInterval = null;
102
- _this.screenProcessingInterval = null;
103
- _this.isProcessing = false;
104
- _this.isProcessingScreen = false;
105
- _this.audioCapture = null;
106
- _this.streamingAudioCapture = null;
107
- _this.lastScreenCapture = null;
108
- _this.lastEnhancedScene = null;
109
- // Worker manager for high-FPS processing
110
- _this.workerManager = null;
111
- // Add tracking for last update times
112
- _this.lastTfUpdateTime = 0;
113
- _this.lastVlmUpdateTime = 0;
114
- _this.lastTfDescription = '';
115
- // Default configuration
116
- _this.DEFAULT_CONFIG = {
117
- pixelChangeThreshold: 50, // 50% change required for VLM update
118
- updateInterval: 100, // Process frames every 100ms
119
- enablePoseDetection: false,
120
- enableObjectDetection: false,
121
- tfUpdateInterval: 1000, // TensorFlow update every 1 second
122
- vlmUpdateInterval: 10000, // VLM update every 10 seconds
123
- tfChangeThreshold: 10, // 10% change triggers TF update
124
- vlmChangeThreshold: 50, // 50% change triggers VLM update
125
- visionMode: types_1.VisionMode.CAMERA, // Default to camera only
126
- screenCaptureInterval: 2000, // Screen capture every 2 seconds
127
- tileSize: 256,
128
- tileProcessingOrder: 'priority',
129
- ocrEnabled: true,
130
- florence2Enabled: true,
131
- };
132
- // Load configuration from runtime settings
133
- _this.visionConfig = _this.parseConfig(runtime);
134
- // Initialize vision models
135
- _this.visionModels = new vision_models_1.VisionModels(runtime);
136
- // Initialize face recognition
137
- _this.faceRecognition = new face_recognition_1.FaceRecognition();
138
- // Initialize entity tracker
139
- var worldId = runtime.getSetting('WORLD_ID') || 'default-world';
140
- _this.entityTracker = new entity_tracker_1.EntityTracker(worldId);
141
- // Initialize screen capture
142
- _this.screenCapture = new screen_capture_1.ScreenCaptureService(_this.visionConfig);
143
- // Initialize Florence-2
144
- _this.florence2 = new florence2_model_1.Florence2Model();
145
- // Initialize OCR service
146
- _this.ocrService = new ocr_service_1.OCRService();
147
- core_1.logger.info('[VisionService] Constructed with config:', _this.visionConfig);
148
- return _this;
149
- }
150
- VisionService.prototype.parseConfig = function (runtime) {
151
- return __assign(__assign({}, this.DEFAULT_CONFIG), { cameraName: runtime.getSetting('CAMERA_NAME') || runtime.getSetting('VISION_CAMERA_NAME'), pixelChangeThreshold: Number(runtime.getSetting('PIXEL_CHANGE_THRESHOLD') ||
152
- runtime.getSetting('VISION_PIXEL_CHANGE_THRESHOLD')) || this.DEFAULT_CONFIG.pixelChangeThreshold, enableObjectDetection: runtime.getSetting('ENABLE_OBJECT_DETECTION') === 'true' ||
153
- runtime.getSetting('VISION_ENABLE_OBJECT_DETECTION') === 'true', enablePoseDetection: runtime.getSetting('ENABLE_POSE_DETECTION') === 'true' ||
154
- runtime.getSetting('VISION_ENABLE_POSE_DETECTION') === 'true', tfUpdateInterval: Number(runtime.getSetting('TF_UPDATE_INTERVAL') ||
155
- runtime.getSetting('VISION_TF_UPDATE_INTERVAL')) || this.DEFAULT_CONFIG.tfUpdateInterval, vlmUpdateInterval: Number(runtime.getSetting('VLM_UPDATE_INTERVAL') ||
156
- runtime.getSetting('VISION_VLM_UPDATE_INTERVAL')) || this.DEFAULT_CONFIG.vlmUpdateInterval, tfChangeThreshold: Number(runtime.getSetting('TF_CHANGE_THRESHOLD') ||
157
- runtime.getSetting('VISION_TF_CHANGE_THRESHOLD')) || this.DEFAULT_CONFIG.tfChangeThreshold, vlmChangeThreshold: Number(runtime.getSetting('VLM_CHANGE_THRESHOLD') ||
158
- runtime.getSetting('VISION_VLM_CHANGE_THRESHOLD')) || this.DEFAULT_CONFIG.vlmChangeThreshold, visionMode: runtime.getSetting('VISION_MODE') || this.DEFAULT_CONFIG.visionMode, screenCaptureInterval: Number(runtime.getSetting('SCREEN_CAPTURE_INTERVAL') ||
159
- runtime.getSetting('VISION_SCREEN_CAPTURE_INTERVAL')) || this.DEFAULT_CONFIG.screenCaptureInterval, ocrEnabled: runtime.getSetting('OCR_ENABLED') === 'true' ||
160
- runtime.getSetting('VISION_OCR_ENABLED') === 'true', florence2Enabled: runtime.getSetting('FLORENCE2_ENABLED') === 'true' ||
161
- runtime.getSetting('VISION_FLORENCE2_ENABLED') === 'true' });
162
- };
163
- VisionService.start = function (runtime) {
164
- return __awaiter(this, void 0, void 0, function () {
165
- var service;
166
- return __generator(this, function (_a) {
167
- switch (_a.label) {
168
- case 0:
169
- service = new VisionService(runtime);
170
- return [4 /*yield*/, service.initialize()];
171
- case 1:
172
- _a.sent();
173
- return [2 /*return*/, service];
174
- }
175
- });
176
- });
177
- };
178
- VisionService.prototype.checkCameraTools = function () {
179
- return __awaiter(this, void 0, void 0, function () {
180
- var platform, _error_1;
181
- return __generator(this, function (_a) {
182
- switch (_a.label) {
183
- case 0:
184
- platform = process.platform;
185
- _a.label = 1;
186
- case 1:
187
- _a.trys.push([1, 8, , 9]);
188
- if (!(platform === 'darwin')) return [3 /*break*/, 3];
189
- // Check if imagesnap is installed
190
- return [4 /*yield*/, execAsync('which imagesnap')];
191
- case 2:
192
- // Check if imagesnap is installed
193
- _a.sent();
194
- return [2 /*return*/, { available: true, tool: 'imagesnap' }];
195
- case 3:
196
- if (!(platform === 'linux')) return [3 /*break*/, 5];
197
- // Check if fswebcam is installed
198
- return [4 /*yield*/, execAsync('which fswebcam')];
199
- case 4:
200
- // Check if fswebcam is installed
201
- _a.sent();
202
- return [2 /*return*/, { available: true, tool: 'fswebcam' }];
203
- case 5:
204
- if (!(platform === 'win32')) return [3 /*break*/, 7];
205
- // Check if ffmpeg is available
206
- return [4 /*yield*/, execAsync('where ffmpeg')];
207
- case 6:
208
- // Check if ffmpeg is available
209
- _a.sent();
210
- return [2 /*return*/, { available: true, tool: 'ffmpeg' }];
211
- case 7: return [2 /*return*/, { available: false, tool: 'none' }];
212
- case 8:
213
- _error_1 = _a.sent();
214
- // Tool not found
215
- return [2 /*return*/, { available: false, tool: 'none' }];
216
- case 9: return [2 /*return*/];
217
- }
218
- });
219
- });
220
- };
221
- VisionService.prototype.initialize = function () {
222
- return __awaiter(this, void 0, void 0, function () {
223
- var useEnhancedModels, _tfError_1, error_1;
224
- return __generator(this, function (_a) {
225
- switch (_a.label) {
226
- case 0:
227
- _a.trys.push([0, 11, , 12]);
228
- useEnhancedModels = this.visionConfig.enableObjectDetection || this.visionConfig.enablePoseDetection;
229
- if (!useEnhancedModels) return [3 /*break*/, 5];
230
- _a.label = 1;
231
- case 1:
232
- _a.trys.push([1, 3, , 5]);
233
- // Try to initialize TensorFlow models first
234
- return [4 /*yield*/, this.visionModels.initialize({
235
- enableObjectDetection: this.visionConfig.enableObjectDetection || false,
236
- enablePoseDetection: this.visionConfig.enablePoseDetection || false,
237
- })];
238
- case 2:
239
- // Try to initialize TensorFlow models first
240
- _a.sent();
241
- core_1.logger.info('[VisionService] Using TensorFlow.js models for advanced detection');
242
- return [3 /*break*/, 5];
243
- case 3:
244
- _tfError_1 = _a.sent();
245
- core_1.logger.warn('[VisionService] TensorFlow.js not available, falling back to enhanced heuristics');
246
- // Fall back to enhanced heuristics
247
- return [4 /*yield*/, this.visionModels.initialize({
248
- enableObjectDetection: this.visionConfig.enableObjectDetection || false,
249
- enablePoseDetection: this.visionConfig.enablePoseDetection || false,
250
- })];
251
- case 4:
252
- // Fall back to enhanced heuristics
253
- _a.sent();
254
- core_1.logger.info('[VisionService] Using enhanced heuristics for detection');
255
- return [3 /*break*/, 5];
256
- case 5:
257
- if (!(this.visionConfig.visionMode === types_1.VisionMode.SCREEN ||
258
- this.visionConfig.visionMode === types_1.VisionMode.BOTH)) return [3 /*break*/, 7];
259
- return [4 /*yield*/, this.initializeScreenVision()];
260
- case 6:
261
- _a.sent();
262
- _a.label = 7;
263
- case 7:
264
- if (!(this.visionConfig.visionMode === types_1.VisionMode.CAMERA ||
265
- this.visionConfig.visionMode === types_1.VisionMode.BOTH)) return [3 /*break*/, 9];
266
- return [4 /*yield*/, this.initializeCameraVision()];
267
- case 8:
268
- _a.sent();
269
- _a.label = 9;
270
- case 9:
271
- // Initialize audio capture if enabled
272
- return [4 /*yield*/, this.initializeAudioCapture()];
273
- case 10:
274
- // Initialize audio capture if enabled
275
- _a.sent();
276
- // Start processing based on mode
277
- this.startProcessing();
278
- return [3 /*break*/, 12];
279
- case 11:
280
- error_1 = _a.sent();
281
- core_1.logger.error('[VisionService] Failed to initialize:', error_1);
282
- return [3 /*break*/, 12];
283
- case 12: return [2 /*return*/];
284
- }
285
- });
286
- });
287
- };
288
- VisionService.prototype.initializeScreenVision = function () {
289
- return __awaiter(this, void 0, void 0, function () {
290
- var useWorkers, screenInfo, error_2;
291
- return __generator(this, function (_a) {
292
- switch (_a.label) {
293
- case 0:
294
- _a.trys.push([0, 8, , 9]);
295
- core_1.logger.info('[VisionService] Initializing screen vision...');
296
- useWorkers = this.visionConfig.targetScreenFPS && this.visionConfig.targetScreenFPS > 10;
297
- if (!useWorkers) return [3 /*break*/, 2];
298
- // Initialize worker manager for high-FPS processing
299
- core_1.logger.info('[VisionService] Initializing worker threads for high-FPS processing...');
300
- this.workerManager = new vision_worker_manager_1.VisionWorkerManager(this.visionConfig);
301
- return [4 /*yield*/, this.workerManager.initialize()];
302
- case 1:
303
- _a.sent();
304
- core_1.logger.info('[VisionService] Worker threads initialized');
305
- return [3 /*break*/, 6];
306
- case 2:
307
- if (!this.visionConfig.florence2Enabled) return [3 /*break*/, 4];
308
- return [4 /*yield*/, this.florence2.initialize()];
309
- case 3:
310
- _a.sent();
311
- _a.label = 4;
312
- case 4:
313
- if (!this.visionConfig.ocrEnabled) return [3 /*break*/, 6];
314
- return [4 /*yield*/, this.ocrService.initialize()];
315
- case 5:
316
- _a.sent();
317
- _a.label = 6;
318
- case 6: return [4 /*yield*/, this.screenCapture.getScreenInfo()];
319
- case 7:
320
- screenInfo = _a.sent();
321
- if (screenInfo) {
322
- core_1.logger.info("[VisionService] Screen resolution: ".concat(screenInfo.width, "x").concat(screenInfo.height));
323
- }
324
- core_1.logger.info('[VisionService] Screen vision initialized');
325
- return [3 /*break*/, 9];
326
- case 8:
327
- error_2 = _a.sent();
328
- core_1.logger.error('[VisionService] Failed to initialize screen vision:', error_2);
329
- return [3 /*break*/, 9];
330
- case 9: return [2 /*return*/];
331
- }
332
- });
333
- });
334
- };
335
- VisionService.prototype.initializeCameraVision = function () {
336
- return __awaiter(this, void 0, void 0, function () {
337
- var toolCheck, platform, toolName, camera;
338
- return __generator(this, function (_a) {
339
- switch (_a.label) {
340
- case 0: return [4 /*yield*/, this.checkCameraTools()];
341
- case 1:
342
- toolCheck = _a.sent();
343
- if (!toolCheck.available) {
344
- platform = process.platform;
345
- toolName = platform === 'darwin' ? 'imagesnap' : platform === 'linux' ? 'fswebcam' : 'ffmpeg';
346
- core_1.logger.warn("[VisionService] Camera capture tool '".concat(toolName, "' not found. Install it to enable camera functionality."));
347
- core_1.logger.warn('[VisionService] For macOS: brew install imagesnap');
348
- core_1.logger.warn('[VisionService] For Linux: sudo apt-get install fswebcam');
349
- core_1.logger.warn('[VisionService] For Windows: Install ffmpeg and add to PATH');
350
- return [2 /*return*/];
351
- }
352
- return [4 /*yield*/, this.findCamera()];
353
- case 2:
354
- camera = _a.sent();
355
- if (camera) {
356
- this.camera = camera;
357
- core_1.logger.info("[VisionService] Connected to camera: ".concat(camera.name));
358
- }
359
- else {
360
- core_1.logger.warn('[VisionService] No suitable camera found');
361
- }
362
- return [2 /*return*/];
363
- }
364
- });
365
- });
366
- };
367
- VisionService.prototype.initializeAudioCapture = function () {
368
- return __awaiter(this, void 0, void 0, function () {
369
- var enableMicrophone, useStreamingAudio, streamingConfig, audioConfig, error_3;
370
- var _this = this;
371
- return __generator(this, function (_a) {
372
- switch (_a.label) {
373
- case 0:
374
- enableMicrophone = this.runtime.getSetting('ENABLE_MICROPHONE') === 'true';
375
- useStreamingAudio = this.runtime.getSetting('USE_STREAMING_AUDIO') === 'true';
376
- if (!enableMicrophone) {
377
- core_1.logger.info('[VisionService] Microphone capture disabled');
378
- return [2 /*return*/];
379
- }
380
- _a.label = 1;
381
- case 1:
382
- _a.trys.push([1, 6, , 7]);
383
- if (!useStreamingAudio) return [3 /*break*/, 3];
384
- streamingConfig = {
385
- enabled: true,
386
- sampleRate: 16000,
387
- channels: 1,
388
- vadThreshold: Number(this.runtime.getSetting('VAD_THRESHOLD')) || 0.01,
389
- silenceTimeout: Number(this.runtime.getSetting('SILENCE_TIMEOUT')) || 1500,
390
- responseDelay: Number(this.runtime.getSetting('RESPONSE_DELAY')) || 3000,
391
- };
392
- this.streamingAudioCapture = new audio_capture_stream_1.StreamingAudioCaptureService(this.runtime, streamingConfig);
393
- // Set up event listeners
394
- this.streamingAudioCapture.on('speechStart', function () {
395
- core_1.logger.info('[VisionService] User started speaking');
396
- });
397
- this.streamingAudioCapture.on('speechEnd', function () {
398
- core_1.logger.info('[VisionService] User stopped speaking');
399
- });
400
- this.streamingAudioCapture.on('transcription', function (data) {
401
- core_1.logger.info("[VisionService] Transcription (".concat(data.isFinal ? 'final' : 'partial', "): ").concat(data.text));
402
- });
403
- this.streamingAudioCapture.on('utteranceComplete', function (text) { return __awaiter(_this, void 0, void 0, function () {
404
- return __generator(this, function (_a) {
405
- switch (_a.label) {
406
- case 0:
407
- core_1.logger.info('[VisionService] Processing complete utterance:', text);
408
- // Store the transcription in memory for context
409
- return [4 /*yield*/, this.storeAudioTranscription(text)];
410
- case 1:
411
- // Store the transcription in memory for context
412
- _a.sent();
413
- return [2 /*return*/];
414
- }
415
- });
416
- }); });
417
- return [4 /*yield*/, this.streamingAudioCapture.initialize()];
418
- case 2:
419
- _a.sent();
420
- core_1.logger.info('[VisionService] Streaming audio capture initialized with VAD');
421
- return [3 /*break*/, 5];
422
- case 3:
423
- audioConfig = {
424
- enabled: true,
425
- transcriptionInterval: Number(this.runtime.getSetting('TRANSCRIPTION_INTERVAL')) || 30000,
426
- };
427
- this.audioCapture = new audio_capture_1.AudioCaptureService(this.runtime, audioConfig);
428
- return [4 /*yield*/, this.audioCapture.initialize()];
429
- case 4:
430
- _a.sent();
431
- core_1.logger.info('[VisionService] Batch audio capture initialized');
432
- _a.label = 5;
433
- case 5: return [3 /*break*/, 7];
434
- case 6:
435
- error_3 = _a.sent();
436
- core_1.logger.error('[VisionService] Failed to initialize audio capture:', error_3);
437
- return [3 /*break*/, 7];
438
- case 7: return [2 /*return*/];
439
- }
440
- });
441
- });
442
- };
443
- VisionService.prototype.storeAudioTranscription = function (text) {
444
- return __awaiter(this, void 0, void 0, function () {
445
- return __generator(this, function (_a) {
446
- try {
447
- // Store transcription in the current scene description
448
- if (this.lastSceneDescription) {
449
- this.lastSceneDescription.audioTranscription = text;
450
- }
451
- // You could also create a memory here if needed
452
- core_1.logger.debug('[VisionService] Stored audio transcription in scene context');
453
- }
454
- catch (error) {
455
- core_1.logger.error('[VisionService] Failed to store audio transcription:', error);
456
- }
457
- return [2 /*return*/];
458
- });
459
- });
460
- };
461
- VisionService.prototype.startProcessing = function () {
462
- // Start camera processing if enabled
463
- if ((this.visionConfig.visionMode === types_1.VisionMode.CAMERA ||
464
- this.visionConfig.visionMode === types_1.VisionMode.BOTH) &&
465
- this.camera) {
466
- this.startFrameProcessing();
467
- }
468
- // Start screen processing if enabled
469
- if (this.visionConfig.visionMode === types_1.VisionMode.SCREEN ||
470
- this.visionConfig.visionMode === types_1.VisionMode.BOTH) {
471
- this.startScreenProcessing();
472
- }
473
- };
474
- VisionService.prototype.startFrameProcessing = function () {
475
- var _this = this;
476
- if (this.frameProcessingInterval) {
477
- return;
478
- }
479
- this.frameProcessingInterval = setInterval(function () { return __awaiter(_this, void 0, void 0, function () {
480
- var error_4;
481
- return __generator(this, function (_a) {
482
- switch (_a.label) {
483
- case 0:
484
- if (!(!this.isProcessing && this.camera)) return [3 /*break*/, 5];
485
- this.isProcessing = true;
486
- _a.label = 1;
487
- case 1:
488
- _a.trys.push([1, 3, , 4]);
489
- return [4 /*yield*/, this.captureAndProcessFrame()];
490
- case 2:
491
- _a.sent();
492
- return [3 /*break*/, 4];
493
- case 3:
494
- error_4 = _a.sent();
495
- core_1.logger.error('[VisionService] Frame processing error:', error_4);
496
- return [3 /*break*/, 4];
497
- case 4:
498
- this.isProcessing = false;
499
- _a.label = 5;
500
- case 5: return [2 /*return*/];
501
- }
502
- });
503
- }); }, this.visionConfig.updateInterval || 100);
504
- core_1.logger.debug('[VisionService] Started frame processing loop');
505
- };
506
- VisionService.prototype.captureAndProcessFrame = function () {
507
- return __awaiter(this, void 0, void 0, function () {
508
- var frameData, frame, changePercentage, _a, error_5;
509
- return __generator(this, function (_b) {
510
- switch (_b.label) {
511
- case 0:
512
- if (!this.camera) {
513
- return [2 /*return*/];
514
- }
515
- _b.label = 1;
516
- case 1:
517
- _b.trys.push([1, 8, , 9]);
518
- return [4 /*yield*/, this.camera.capture()];
519
- case 2:
520
- frameData = _b.sent();
521
- // Skip if no data
522
- if (!frameData || frameData.length === 0) {
523
- core_1.logger.debug('[VisionService] Camera returned empty frame, skipping');
524
- return [2 /*return*/];
525
- }
526
- return [4 /*yield*/, this.processFrameData(frameData)];
527
- case 3:
528
- frame = _b.sent();
529
- // Validate frame before processing
530
- if (!frame || frame.width === 0 || frame.height === 0) {
531
- core_1.logger.warn('[VisionService] Invalid frame dimensions, skipping');
532
- return [2 /*return*/];
533
- }
534
- if (!this.lastFrame) return [3 /*break*/, 5];
535
- return [4 /*yield*/, this.calculatePixelChange(this.lastFrame, frame)];
536
- case 4:
537
- _a = _b.sent();
538
- return [3 /*break*/, 6];
539
- case 5:
540
- _a = 100;
541
- _b.label = 6;
542
- case 6:
543
- changePercentage = _a;
544
- // Update scene description if change is significant or enough time has passed
545
- // Always call updateSceneDescription - it will decide what to update based on thresholds
546
- return [4 /*yield*/, this.updateSceneDescription(frame, changePercentage)];
547
- case 7:
548
- // Update scene description if change is significant or enough time has passed
549
- // Always call updateSceneDescription - it will decide what to update based on thresholds
550
- _b.sent();
551
- this.lastFrame = frame;
552
- return [3 /*break*/, 9];
553
- case 8:
554
- error_5 = _b.sent();
555
- core_1.logger.error('[VisionService] Error capturing frame:', error_5);
556
- return [3 /*break*/, 9];
557
- case 9: return [2 /*return*/];
558
- }
559
- });
560
- });
561
- };
562
- VisionService.prototype.processFrameData = function (data) {
563
- return __awaiter(this, void 0, void 0, function () {
564
- var image, metadata, rgbaBuffer;
565
- return __generator(this, function (_a) {
566
- switch (_a.label) {
567
- case 0:
568
- // Validate input data
569
- if (!data || data.length === 0) {
570
- throw new Error('Empty frame data received from camera');
571
- }
572
- image = (0, sharp_1.default)(data);
573
- return [4 /*yield*/, image.metadata()];
574
- case 1:
575
- metadata = _a.sent();
576
- // Validate metadata
577
- if (!metadata.width || !metadata.height || metadata.width === 0 || metadata.height === 0) {
578
- throw new Error("Invalid image dimensions: ".concat(metadata.width, "x").concat(metadata.height));
579
- }
580
- return [4 /*yield*/, image.ensureAlpha().raw().toBuffer()];
581
- case 2:
582
- rgbaBuffer = _a.sent();
583
- return [2 /*return*/, {
584
- timestamp: Date.now(),
585
- width: metadata.width,
586
- height: metadata.height,
587
- data: rgbaBuffer,
588
- format: 'rgba',
589
- }];
590
- }
591
- });
592
- });
593
- };
594
- VisionService.prototype.calculatePixelChange = function (frame1, frame2) {
595
- return __awaiter(this, void 0, void 0, function () {
596
- var pixels1, pixels2, changedPixels, totalPixels, threshold, i, r1, g1, b1, r2, g2, b2, diff;
597
- return __generator(this, function (_a) {
598
- if (frame1.width !== frame2.width || frame1.height !== frame2.height) {
599
- return [2 /*return*/, 100]; // Different dimensions = complete change
600
- }
601
- pixels1 = frame1.data;
602
- pixels2 = frame2.data;
603
- changedPixels = 0;
604
- totalPixels = frame1.width * frame1.height;
605
- threshold = 30;
606
- for (i = 0; i < pixels1.length; i += 4) {
607
- r1 = pixels1[i];
608
- g1 = pixels1[i + 1];
609
- b1 = pixels1[i + 2];
610
- r2 = pixels2[i];
611
- g2 = pixels2[i + 1];
612
- b2 = pixels2[i + 2];
613
- diff = Math.abs(r1 - r2) + Math.abs(g1 - g2) + Math.abs(b1 - b2);
614
- if (diff > threshold) {
615
- changedPixels++;
616
- }
617
- }
618
- return [2 /*return*/, (changedPixels / totalPixels) * 100];
619
- });
620
- });
621
- };
622
- VisionService.prototype.updateSceneDescription = function (frame, changePercentage) {
623
- return __awaiter(this, void 0, void 0, function () {
624
- var currentTime, jpegBuffer, base64Image, imageUrl, timeSinceVlmUpdate, shouldUpdateVlm, description, timeSinceTfUpdate, shouldUpdateTf, detectedObjects, people, poses, personObjects, faceProfiles, enableFaceRecognition, faces, _i, faces_1, face, faceBox, _a, people_1, person, overlap, match, profileId, faceError_1, _trackedEntities, objectSummary, _b, _c, _d, type, count, _e, people_2, person, error_6;
625
- var _f, _g;
626
- return __generator(this, function (_h) {
627
- switch (_h.label) {
628
- case 0:
629
- _h.trys.push([0, 27, , 28]);
630
- currentTime = Date.now();
631
- return [4 /*yield*/, (0, sharp_1.default)(frame.data, {
632
- raw: {
633
- width: frame.width,
634
- height: frame.height,
635
- channels: 4,
636
- },
637
- })
638
- .jpeg()
639
- .toBuffer()];
640
- case 1:
641
- jpegBuffer = _h.sent();
642
- base64Image = jpegBuffer.toString('base64');
643
- imageUrl = "data:image/jpeg;base64,".concat(base64Image);
644
- timeSinceVlmUpdate = currentTime - this.lastVlmUpdateTime;
645
- shouldUpdateVlm = timeSinceVlmUpdate >= this.visionConfig.vlmUpdateInterval || // Time threshold
646
- changePercentage >= this.visionConfig.vlmChangeThreshold;
647
- description = this.lastTfDescription;
648
- if (!shouldUpdateVlm) return [3 /*break*/, 3];
649
- return [4 /*yield*/, this.describeSceneWithVLM(imageUrl)];
650
- case 2:
651
- // Use VLM to describe the scene
652
- description = _h.sent();
653
- this.lastVlmUpdateTime = currentTime;
654
- this.lastTfDescription = description;
655
- core_1.logger.debug("[VisionService] VLM updated: ".concat(timeSinceVlmUpdate, "ms since last update, ").concat(changePercentage.toFixed(1), "% change"));
656
- _h.label = 3;
657
- case 3:
658
- timeSinceTfUpdate = currentTime - this.lastTfUpdateTime;
659
- shouldUpdateTf = timeSinceTfUpdate >= this.visionConfig.tfUpdateInterval || // Time threshold
660
- changePercentage >= this.visionConfig.tfChangeThreshold;
661
- detectedObjects = [];
662
- people = [];
663
- if (!(shouldUpdateTf &&
664
- (this.visionConfig.enableObjectDetection || this.visionConfig.enablePoseDetection))) return [3 /*break*/, 8];
665
- this.lastTfUpdateTime = currentTime;
666
- core_1.logger.debug("[VisionService] TF updating: ".concat(timeSinceTfUpdate, "ms since last update, ").concat(changePercentage.toFixed(1), "% change"));
667
- if (!this.visionConfig.enableObjectDetection) return [3 /*break*/, 5];
668
- if (!this.visionModels.hasObjectDetection()) return [3 /*break*/, 5];
669
- return [4 /*yield*/, this.visionModels.detectObjects(frame.data, frame.width, frame.height)];
670
- case 4:
671
- detectedObjects = _h.sent();
672
- core_1.logger.debug("[VisionService] VisionModels detected ".concat(detectedObjects.length, " objects"));
673
- _h.label = 5;
674
- case 5:
675
- if (!this.visionConfig.enablePoseDetection) return [3 /*break*/, 7];
676
- if (!this.visionModels.hasPoseDetection()) return [3 /*break*/, 7];
677
- return [4 /*yield*/, this.visionModels.detectPoses(frame.data, frame.width, frame.height)];
678
- case 6:
679
- poses = _h.sent();
680
- people = poses;
681
- core_1.logger.debug("[VisionService] VisionModels detected ".concat(people.length, " people with poses"));
682
- _h.label = 7;
683
- case 7:
684
- // If no people detected via pose but objects detected, check for person objects
685
- if (people.length === 0 && detectedObjects.length > 0) {
686
- personObjects = detectedObjects.filter(function (obj) { return obj.type === 'person'; });
687
- people = personObjects.map(function (obj) { return ({
688
- id: "person-".concat(obj.id),
689
- pose: 'unknown',
690
- facing: 'unknown',
691
- confidence: obj.confidence,
692
- boundingBox: obj.boundingBox,
693
- }); });
694
- }
695
- return [3 /*break*/, 12];
696
- case 8:
697
- if (!(!shouldUpdateTf && this.lastSceneDescription)) return [3 /*break*/, 9];
698
- // Reuse last detection results if not updating
699
- detectedObjects = this.lastSceneDescription.objects;
700
- people = this.lastSceneDescription.people;
701
- return [3 /*break*/, 12];
702
- case 9: return [4 /*yield*/, this.detectMotionObjects(frame)];
703
- case 10:
704
- // Fall back to motion-based detection
705
- detectedObjects = _h.sent();
706
- return [4 /*yield*/, this.detectPeopleFromMotion(frame, detectedObjects)];
707
- case 11:
708
- people = _h.sent();
709
- _h.label = 12;
710
- case 12:
711
- faceProfiles = new Map();
712
- enableFaceRecognition = this.runtime.getSetting('ENABLE_FACE_RECOGNITION') === 'true';
713
- if (!(enableFaceRecognition && people.length > 0 && frame.width > 0 && frame.height > 0)) return [3 /*break*/, 25];
714
- _h.label = 13;
715
- case 13:
716
- _h.trys.push([13, 24, , 25]);
717
- // Validate frame data
718
- if (!frame.data || frame.data.length === 0) {
719
- core_1.logger.warn('[VisionService] Invalid frame data for face recognition');
720
- return [2 /*return*/];
721
- }
722
- return [4 /*yield*/, this.faceRecognition.detectFaces(frame.data, frame.width, frame.height)];
723
- case 14:
724
- faces = _h.sent();
725
- _i = 0, faces_1 = faces;
726
- _h.label = 15;
727
- case 15:
728
- if (!(_i < faces_1.length)) return [3 /*break*/, 23];
729
- face = faces_1[_i];
730
- faceBox = face.detection.box;
731
- _a = 0, people_1 = people;
732
- _h.label = 16;
733
- case 16:
734
- if (!(_a < people_1.length)) return [3 /*break*/, 22];
735
- person = people_1[_a];
736
- overlap = this.calculateBoxOverlap(person.boundingBox, {
737
- x: Math.round(faceBox.x),
738
- y: Math.round(faceBox.y),
739
- width: Math.round(faceBox.width),
740
- height: Math.round(faceBox.height),
741
- });
742
- if (!(overlap > 0.5)) return [3 /*break*/, 21];
743
- return [4 /*yield*/, this.faceRecognition.recognizeFace(face.descriptor)];
744
- case 17:
745
- match = _h.sent();
746
- profileId = void 0;
747
- if (!match) return [3 /*break*/, 18];
748
- profileId = match.profileId;
749
- core_1.logger.debug("[VisionService] Recognized face: ".concat(profileId, " (distance: ").concat(match.distance, ")"));
750
- return [3 /*break*/, 20];
751
- case 18: return [4 /*yield*/, this.faceRecognition.addOrUpdateFace(face.descriptor, {
752
- attributes: {
753
- age: (_f = face.ageGender) === null || _f === void 0 ? void 0 : _f.age.toString(),
754
- gender: (_g = face.ageGender) === null || _g === void 0 ? void 0 : _g.gender,
755
- emotion: face.expressions
756
- ? this.getDominantExpression(face.expressions)
757
- : undefined,
758
- },
759
- })];
760
- case 19:
761
- // Register new face
762
- profileId = _h.sent();
763
- core_1.logger.info("[VisionService] New face registered: ".concat(profileId));
764
- _h.label = 20;
765
- case 20:
766
- faceProfiles.set(person.id, profileId);
767
- return [3 /*break*/, 22];
768
- case 21:
769
- _a++;
770
- return [3 /*break*/, 16];
771
- case 22:
772
- _i++;
773
- return [3 /*break*/, 15];
774
- case 23: return [3 /*break*/, 25];
775
- case 24:
776
- faceError_1 = _h.sent();
777
- core_1.logger.error('[VisionService] Face recognition error:', faceError_1);
778
- return [3 /*break*/, 25];
779
- case 25: return [4 /*yield*/, this.entityTracker.updateEntities(detectedObjects, people, faceProfiles, this.runtime)];
780
- case 26:
781
- _trackedEntities = _h.sent();
782
- // Create scene description
783
- this.lastSceneDescription = {
784
- timestamp: frame.timestamp,
785
- description: description,
786
- objects: detectedObjects,
787
- people: people,
788
- sceneChanged: shouldUpdateVlm || shouldUpdateTf,
789
- changePercentage: changePercentage,
790
- };
791
- // Enhanced logging
792
- if (shouldUpdateVlm || shouldUpdateTf) {
793
- core_1.logger.info('[VisionService] Scene Analysis Complete:');
794
- core_1.logger.info(" VLM Description: ".concat(description.substring(0, 100), "..."));
795
- core_1.logger.info(" Change: ".concat(changePercentage.toFixed(1), "%"));
796
- core_1.logger.info(" Updates: ".concat(shouldUpdateVlm ? 'VLM' : '').concat(shouldUpdateVlm && shouldUpdateTf ? ' + ' : '').concat(shouldUpdateTf ? 'TF' : ''));
797
- core_1.logger.info(" Detection Mode: ".concat(this.visionConfig.enableObjectDetection ? 'Advanced CV' : 'Motion-based'));
798
- if (detectedObjects.length > 0) {
799
- core_1.logger.info(" Objects: ".concat(detectedObjects.length, " detected"));
800
- objectSummary = detectedObjects.reduce(function (acc, obj) {
801
- acc[obj.type] = (acc[obj.type] || 0) + 1;
802
- return acc;
803
- }, {});
804
- for (_b = 0, _c = Object.entries(objectSummary); _b < _c.length; _b++) {
805
- _d = _c[_b], type = _d[0], count = _d[1];
806
- core_1.logger.info(" - ".concat(count, " ").concat(type, "(s)"));
807
- }
808
- }
809
- if (people.length > 0) {
810
- core_1.logger.info(" People: ".concat(people.length, " detected"));
811
- for (_e = 0, people_2 = people; _e < people_2.length; _e++) {
812
- person = people_2[_e];
813
- core_1.logger.info(" - Person: ".concat(person.pose, " pose, facing ").concat(person.facing, ", confidence: ").concat(person.confidence.toFixed(2)));
814
- }
815
- }
816
- }
817
- return [3 /*break*/, 28];
818
- case 27:
819
- error_6 = _h.sent();
820
- core_1.logger.error('[VisionService] Failed to update scene description:', error_6);
821
- return [3 /*break*/, 28];
822
- case 28: return [2 /*return*/];
823
- }
824
- });
825
- });
826
- };
827
- VisionService.prototype.describeSceneWithVLM = function (imageUrl) {
828
- return __awaiter(this, void 0, void 0, function () {
829
- var base64Data, imageBuffer, result, florenceError_1, result, description, stringResult, modelError_1, _a, objects, people, description, poses, objectTypes, error_7;
830
- return __generator(this, function (_b) {
831
- switch (_b.label) {
832
- case 0:
833
- _b.trys.push([0, 8, , 9]);
834
- if (!imageUrl.startsWith('data:image/')) return [3 /*break*/, 4];
835
- base64Data = imageUrl.split(',')[1];
836
- imageBuffer = Buffer.from(base64Data, 'base64');
837
- if (!this.florence2.isInitialized()) return [3 /*break*/, 4];
838
- _b.label = 1;
839
- case 1:
840
- _b.trys.push([1, 3, , 4]);
841
- return [4 /*yield*/, this.florence2.analyzeImage(imageBuffer)];
842
- case 2:
843
- result = _b.sent();
844
- if (result.caption) {
845
- core_1.logger.debug('[VisionService] Florence-2 description:', result.caption);
846
- return [2 /*return*/, result.caption];
847
- }
848
- return [3 /*break*/, 4];
849
- case 3:
850
- florenceError_1 = _b.sent();
851
- core_1.logger.warn('[VisionService] Florence-2 analysis failed, falling back:', florenceError_1);
852
- return [3 /*break*/, 4];
853
- case 4:
854
- _b.trys.push([4, 6, , 7]);
855
- return [4 /*yield*/, this.runtime.useModel(core_1.ModelType.IMAGE_DESCRIPTION, imageUrl)];
856
- case 5:
857
- result = _b.sent();
858
- if (result && typeof result === 'object' && 'description' in result) {
859
- description = result.description;
860
- // Check if we got the unhelpful default response
861
- if (!description.includes("I'm unable to analyze images") &&
862
- !description.includes("I can't analyze images")) {
863
- return [2 /*return*/, description];
864
- }
865
- }
866
- else if (typeof result === 'string') {
867
- stringResult = result;
868
- if (stringResult.length > 0 &&
869
- !stringResult.includes("I'm unable to analyze images") &&
870
- !stringResult.includes("I can't analyze images")) {
871
- return [2 /*return*/, stringResult];
872
- }
873
- }
874
- return [3 /*break*/, 7];
875
- case 6:
876
- modelError_1 = _b.sent();
877
- core_1.logger.warn('[VisionService] Runtime IMAGE_DESCRIPTION model failed:', modelError_1);
878
- return [3 /*break*/, 7];
879
- case 7:
880
- // If we got the unhelpful response or an error, provide a basic description based on detected objects
881
- if (this.lastSceneDescription) {
882
- _a = this.lastSceneDescription, objects = _a.objects, people = _a.people;
883
- description = 'Scene contains';
884
- if (people.length > 0) {
885
- description += " ".concat(people.length, " person").concat(people.length > 1 ? 's' : '');
886
- poses = people.map(function (p) { return p.pose; }).filter(function (p) { return p !== 'unknown'; });
887
- if (poses.length > 0) {
888
- description += " (".concat(poses.join(', '), ")");
889
- }
890
- }
891
- if (objects.length > 0 && people.length > 0) {
892
- description += ' and';
893
- }
894
- if (objects.length > 0) {
895
- objectTypes = __spreadArray([], new Set(objects.map(function (o) { return o.type; })), true);
896
- description += " ".concat(objectTypes.join(', '));
897
- }
898
- if (people.length === 0 && objects.length === 0) {
899
- description = 'Scene appears to be empty or static';
900
- }
901
- return [2 /*return*/, description];
902
- }
903
- // Final fallback
904
- return [2 /*return*/, 'Visual scene captured'];
905
- case 8:
906
- error_7 = _b.sent();
907
- core_1.logger.error('[VisionService] VLM description failed:', error_7);
908
- return [2 /*return*/, 'Unable to describe scene'];
909
- case 9: return [2 /*return*/];
910
- }
911
- });
912
- });
913
- };
914
- VisionService.prototype.detectMotionObjects = function (frame) {
915
- return __awaiter(this, void 0, void 0, function () {
916
- var objects, blockSize, motionThreshold, y, x, blockMotion, pixelCount, by, bx, px, py, idx, r1, g1, b1, r2, g2, b2, diff, motionPercentage, merged, filtered;
917
- return __generator(this, function (_a) {
918
- if (!this.lastFrame) {
919
- return [2 /*return*/, []];
920
- }
921
- objects = [];
922
- blockSize = 64;
923
- motionThreshold = 50;
924
- // Divide frame into blocks and detect motion regions
925
- for (y = 0; y < frame.height - blockSize; y += blockSize / 2) {
926
- // Overlap blocks
927
- for (x = 0; x < frame.width - blockSize; x += blockSize / 2) {
928
- blockMotion = 0;
929
- pixelCount = 0;
930
- // Check motion in this block
931
- for (by = 0; by < blockSize; by += 2) {
932
- // Sample every other pixel for speed
933
- for (bx = 0; bx < blockSize; bx += 2) {
934
- px = x + bx;
935
- py = y + by;
936
- idx = (py * frame.width + px) * 4;
937
- if (idx < frame.data.length && idx < this.lastFrame.data.length) {
938
- r1 = frame.data[idx];
939
- g1 = frame.data[idx + 1];
940
- b1 = frame.data[idx + 2];
941
- r2 = this.lastFrame.data[idx];
942
- g2 = this.lastFrame.data[idx + 1];
943
- b2 = this.lastFrame.data[idx + 2];
944
- diff = Math.abs(r1 - r2) + Math.abs(g1 - g2) + Math.abs(b1 - b2);
945
- if (diff > motionThreshold) {
946
- blockMotion++;
947
- }
948
- pixelCount++;
949
- }
950
- }
951
- }
952
- motionPercentage = (blockMotion / pixelCount) * 100;
953
- if (motionPercentage > 30) {
954
- // 30% of sampled pixels show motion
955
- objects.push({
956
- id: "motion-".concat(x, "-").concat(y, "-").concat(frame.timestamp),
957
- type: 'motion-object',
958
- confidence: Math.min(motionPercentage / 100, 1),
959
- boundingBox: {
960
- x: x,
961
- y: y,
962
- width: blockSize,
963
- height: blockSize,
964
- },
965
- });
966
- }
967
- }
968
- }
969
- merged = this.mergeAdjacentObjects(objects);
970
- filtered = merged.filter(function (obj) {
971
- var area = obj.boundingBox.width * obj.boundingBox.height;
972
- return area > 2000; // Minimum area threshold
973
- });
974
- return [2 /*return*/, filtered];
975
- });
976
- });
977
- };
978
- VisionService.prototype.mergeAdjacentObjects = function (objects) {
979
- if (objects.length === 0) {
980
- return [];
981
- }
982
- var merged = [];
983
- var used = new Set();
984
- var mergeDistance = 80; // Distance to consider objects adjacent
985
- for (var i = 0; i < objects.length; i++) {
986
- if (used.has(i)) {
987
- continue;
988
- }
989
- var current = objects[i];
990
- var cluster = [current];
991
- used.add(i);
992
- // Find all adjacent objects
993
- var foundNew = true;
994
- while (foundNew) {
995
- foundNew = false;
996
- for (var j = 0; j < objects.length; j++) {
997
- if (used.has(j)) {
998
- continue;
999
- }
1000
- var other = objects[j];
1001
- // Check if adjacent to any object in cluster
1002
- for (var _i = 0, cluster_1 = cluster; _i < cluster_1.length; _i++) {
1003
- var clusterObj = cluster_1[_i];
1004
- var isAdjacent = Math.abs(clusterObj.boundingBox.x - other.boundingBox.x) <= mergeDistance &&
1005
- Math.abs(clusterObj.boundingBox.y - other.boundingBox.y) <= mergeDistance;
1006
- if (isAdjacent) {
1007
- cluster.push(other);
1008
- used.add(j);
1009
- foundNew = true;
1010
- break;
1011
- }
1012
- }
1013
- }
1014
- }
1015
- // Merge cluster into single object
1016
- if (cluster.length > 0) {
1017
- var minX = Math.min.apply(Math, cluster.map(function (o) { return o.boundingBox.x; }));
1018
- var minY = Math.min.apply(Math, cluster.map(function (o) { return o.boundingBox.y; }));
1019
- var maxX = Math.max.apply(Math, cluster.map(function (o) { return o.boundingBox.x + o.boundingBox.width; }));
1020
- var maxY = Math.max.apply(Math, cluster.map(function (o) { return o.boundingBox.y + o.boundingBox.height; }));
1021
- var avgConfidence = cluster.reduce(function (sum, o) { return sum + o.confidence; }, 0) / cluster.length;
1022
- merged.push({
1023
- id: "merged-".concat(minX, "-").concat(minY, "-").concat(Date.now()),
1024
- type: this.classifyObjectBySize(maxX - minX, maxY - minY),
1025
- confidence: avgConfidence,
1026
- boundingBox: {
1027
- x: minX,
1028
- y: minY,
1029
- width: maxX - minX,
1030
- height: maxY - minY,
1031
- },
1032
- });
1033
- }
1034
- }
1035
- return merged;
1036
- };
1037
- VisionService.prototype.classifyObjectBySize = function (width, height) {
1038
- var area = width * height;
1039
- var aspectRatio = width / height;
1040
- // Improved classification heuristics
1041
- if (area > 30000 && aspectRatio > 0.4 && aspectRatio < 0.8) {
1042
- return 'person-candidate';
1043
- }
1044
- else if (area > 20000) {
1045
- return 'large-object';
1046
- }
1047
- else if (area > 8000) {
1048
- return 'medium-object';
1049
- }
1050
- else {
1051
- return 'small-object';
1052
- }
1053
- };
1054
- VisionService.prototype.detectPeopleFromMotion = function (frame, objects) {
1055
- return __awaiter(this, void 0, void 0, function () {
1056
- var people, personCandidates, i, candidate, box, aspectRatio, pose, facing;
1057
- return __generator(this, function (_a) {
1058
- people = [];
1059
- personCandidates = objects.filter(function (o) { return o.type === 'person-candidate'; });
1060
- for (i = 0; i < personCandidates.length; i++) {
1061
- candidate = personCandidates[i];
1062
- box = candidate.boundingBox;
1063
- aspectRatio = box.width / box.height;
1064
- pose = 'unknown';
1065
- if (aspectRatio < 0.6) {
1066
- pose = 'standing';
1067
- }
1068
- else if (aspectRatio > 1.2) {
1069
- pose = 'lying';
1070
- }
1071
- else {
1072
- pose = 'sitting';
1073
- }
1074
- facing = 'unknown';
1075
- if (this.lastFrame) {
1076
- // In Phase 1, we'll just use 'unknown' or random assignment
1077
- // Phase 2 will implement proper pose detection
1078
- facing = 'camera'; // Default assumption
1079
- }
1080
- people.push({
1081
- id: "person-".concat(i, "-").concat(frame.timestamp),
1082
- confidence: candidate.confidence,
1083
- pose: pose,
1084
- facing: facing,
1085
- boundingBox: box,
1086
- });
1087
- }
1088
- return [2 /*return*/, people];
1089
- });
1090
- });
1091
- };
1092
- VisionService.prototype.startScreenProcessing = function () {
1093
- var _this = this;
1094
- if (this.screenProcessingInterval) {
1095
- return;
1096
- }
1097
- this.screenProcessingInterval = setInterval(function () { return __awaiter(_this, void 0, void 0, function () {
1098
- var error_8;
1099
- return __generator(this, function (_a) {
1100
- switch (_a.label) {
1101
- case 0:
1102
- if (!!this.isProcessingScreen) return [3 /*break*/, 5];
1103
- this.isProcessingScreen = true;
1104
- _a.label = 1;
1105
- case 1:
1106
- _a.trys.push([1, 3, , 4]);
1107
- return [4 /*yield*/, this.captureAndProcessScreen()];
1108
- case 2:
1109
- _a.sent();
1110
- return [3 /*break*/, 4];
1111
- case 3:
1112
- error_8 = _a.sent();
1113
- core_1.logger.error('[VisionService] Screen processing error:', error_8);
1114
- return [3 /*break*/, 4];
1115
- case 4:
1116
- this.isProcessingScreen = false;
1117
- _a.label = 5;
1118
- case 5: return [2 /*return*/];
1119
- }
1120
- });
1121
- }); }, this.visionConfig.screenCaptureInterval || 2000);
1122
- core_1.logger.debug('[VisionService] Started screen processing loop');
1123
- };
1124
- VisionService.prototype.captureAndProcessScreen = function () {
1125
- return __awaiter(this, void 0, void 0, function () {
1126
- var capture, activeTile, tileAnalysis, error_9;
1127
- return __generator(this, function (_a) {
1128
- switch (_a.label) {
1129
- case 0:
1130
- _a.trys.push([0, 5, , 6]);
1131
- return [4 /*yield*/, this.screenCapture.captureScreen()];
1132
- case 1:
1133
- capture = _a.sent();
1134
- this.lastScreenCapture = capture;
1135
- activeTile = this.screenCapture.getActiveTile();
1136
- if (!(activeTile && activeTile.data)) return [3 /*break*/, 3];
1137
- return [4 /*yield*/, this.analyzeTile(activeTile)];
1138
- case 2:
1139
- tileAnalysis = _a.sent();
1140
- activeTile.analysis = tileAnalysis;
1141
- _a.label = 3;
1142
- case 3:
1143
- // Update enhanced scene description
1144
- return [4 /*yield*/, this.updateEnhancedSceneDescription()];
1145
- case 4:
1146
- // Update enhanced scene description
1147
- _a.sent();
1148
- return [3 /*break*/, 6];
1149
- case 5:
1150
- error_9 = _a.sent();
1151
- core_1.logger.error('[VisionService] Error capturing screen:', error_9);
1152
- return [3 /*break*/, 6];
1153
- case 6: return [2 /*return*/];
1154
- }
1155
- });
1156
- });
1157
- };
1158
- VisionService.prototype.analyzeTile = function (tile) {
1159
- return __awaiter(this, void 0, void 0, function () {
1160
- var analysis, _a, _b, error_10;
1161
- var _c;
1162
- return __generator(this, function (_d) {
1163
- switch (_d.label) {
1164
- case 0:
1165
- analysis = {
1166
- timestamp: Date.now(),
1167
- };
1168
- _d.label = 1;
1169
- case 1:
1170
- _d.trys.push([1, 6, , 7]);
1171
- if (!(this.visionConfig.florence2Enabled && tile.data)) return [3 /*break*/, 3];
1172
- _a = analysis;
1173
- return [4 /*yield*/, this.florence2.analyzeTile(tile)];
1174
- case 2:
1175
- _a.florence2 = _d.sent();
1176
- analysis.summary = analysis.florence2.caption;
1177
- _d.label = 3;
1178
- case 3:
1179
- if (!(this.visionConfig.ocrEnabled && tile.data)) return [3 /*break*/, 5];
1180
- _b = analysis;
1181
- return [4 /*yield*/, this.ocrService.extractFromTile(tile)];
1182
- case 4:
1183
- _b.ocr = _d.sent();
1184
- analysis.text = analysis.ocr.fullText;
1185
- _d.label = 5;
1186
- case 5:
1187
- // Extract objects from Florence-2 results
1188
- if ((_c = analysis.florence2) === null || _c === void 0 ? void 0 : _c.objects) {
1189
- analysis.objects = analysis.florence2.objects.map(function (obj) { return ({
1190
- id: "screen-obj-".concat(Date.now(), "-").concat(Math.random()),
1191
- type: obj.label,
1192
- confidence: obj.confidence,
1193
- boundingBox: obj.bbox,
1194
- }); });
1195
- }
1196
- return [3 /*break*/, 7];
1197
- case 6:
1198
- error_10 = _d.sent();
1199
- core_1.logger.error('[VisionService] Error analyzing tile:', error_10);
1200
- return [3 /*break*/, 7];
1201
- case 7: return [2 /*return*/, analysis];
1202
- }
1203
- });
1204
- });
1205
- };
1206
- VisionService.prototype.updateEnhancedSceneDescription = function () {
1207
- return __awaiter(this, void 0, void 0, function () {
1208
- var enhancedScene, processedTiles, tilesWithContent, windows;
1209
- var _a, _b, _c;
1210
- return __generator(this, function (_d) {
1211
- if (!this.lastScreenCapture) {
1212
- return [2 /*return*/];
1213
- }
1214
- enhancedScene = __assign(__assign({}, (this.lastSceneDescription || {
1215
- timestamp: Date.now(),
1216
- description: '',
1217
- objects: [],
1218
- people: [],
1219
- sceneChanged: false,
1220
- changePercentage: 0,
1221
- })), { screenCapture: this.lastScreenCapture, screenAnalysis: {
1222
- fullScreenOCR: '',
1223
- activeTile: (_a = this.screenCapture.getActiveTile()) === null || _a === void 0 ? void 0 : _a.analysis,
1224
- gridSummary: '',
1225
- focusedApp: '',
1226
- uiElements: [],
1227
- } });
1228
- processedTiles = this.lastScreenCapture.tiles.filter(function (t) { var _a; return (_a = t.analysis) === null || _a === void 0 ? void 0 : _a.ocr; });
1229
- if (processedTiles.length > 0) {
1230
- enhancedScene.screenAnalysis.fullScreenOCR = processedTiles
1231
- .map(function (t) { return t.analysis.ocr.fullText; })
1232
- .join('\n');
1233
- }
1234
- // Generate grid summary
1235
- if (this.lastScreenCapture.tiles.length > 0) {
1236
- tilesWithContent = this.lastScreenCapture.tiles.filter(function (t) { return t.analysis; });
1237
- enhancedScene.screenAnalysis.gridSummary = "Screen divided into ".concat(this.lastScreenCapture.tiles.length, " tiles, ").concat(tilesWithContent.length, " analyzed");
1238
- }
1239
- // Detect focused application (heuristic based on UI elements)
1240
- if ((_c = (_b = enhancedScene.screenAnalysis.activeTile) === null || _b === void 0 ? void 0 : _b.florence2) === null || _c === void 0 ? void 0 : _c.objects) {
1241
- windows = enhancedScene.screenAnalysis.activeTile.florence2.objects.filter(function (obj) { return obj.label === 'window'; });
1242
- if (windows.length > 0) {
1243
- enhancedScene.screenAnalysis.focusedApp = 'Desktop Application';
1244
- }
1245
- }
1246
- this.lastEnhancedScene = enhancedScene;
1247
- return [2 /*return*/];
1248
- });
1249
- });
1250
- };
1251
- // Public API methods
1252
- VisionService.prototype.getCurrentFrame = function () {
1253
- return __awaiter(this, void 0, void 0, function () {
1254
- return __generator(this, function (_a) {
1255
- return [2 /*return*/, this.lastFrame];
1256
- });
1257
- });
1258
- };
1259
- VisionService.prototype.getSceneDescription = function () {
1260
- return __awaiter(this, void 0, void 0, function () {
1261
- return __generator(this, function (_a) {
1262
- return [2 /*return*/, this.lastSceneDescription];
1263
- });
1264
- });
1265
- };
1266
- VisionService.prototype.getEnhancedSceneDescription = function () {
1267
- return __awaiter(this, void 0, void 0, function () {
1268
- return __generator(this, function (_a) {
1269
- // If worker manager is available, use its high-FPS data
1270
- if (this.workerManager) {
1271
- return [2 /*return*/, this.workerManager.getLatestEnhancedScene()];
1272
- }
1273
- // Otherwise fall back to standard processing
1274
- return [2 /*return*/, this.lastEnhancedScene || this.lastSceneDescription];
1275
- });
1276
- });
1277
- };
1278
- VisionService.prototype.getScreenCapture = function () {
1279
- return __awaiter(this, void 0, void 0, function () {
1280
- return __generator(this, function (_a) {
1281
- return [2 /*return*/, this.lastScreenCapture];
1282
- });
1283
- });
1284
- };
1285
- VisionService.prototype.getVisionMode = function () {
1286
- return this.visionConfig.visionMode || types_1.VisionMode.CAMERA;
1287
- };
1288
- VisionService.prototype.setVisionMode = function (mode) {
1289
- return __awaiter(this, void 0, void 0, function () {
1290
- return __generator(this, function (_a) {
1291
- switch (_a.label) {
1292
- case 0:
1293
- core_1.logger.info("[VisionService] Changing vision mode from ".concat(this.visionConfig.visionMode, " to ").concat(mode));
1294
- // Stop current processing
1295
- this.stopProcessing();
1296
- // Update configuration
1297
- this.visionConfig.visionMode = mode;
1298
- // Reinitialize based on new mode
1299
- if (mode === types_1.VisionMode.OFF) {
1300
- core_1.logger.info('[VisionService] Vision disabled');
1301
- return [2 /*return*/];
1302
- }
1303
- if (!((mode === types_1.VisionMode.CAMERA || mode === types_1.VisionMode.BOTH) && !this.camera)) return [3 /*break*/, 2];
1304
- return [4 /*yield*/, this.initializeCameraVision()];
1305
- case 1:
1306
- _a.sent();
1307
- _a.label = 2;
1308
- case 2:
1309
- if (!((mode === types_1.VisionMode.SCREEN || mode === types_1.VisionMode.BOTH) &&
1310
- (!this.florence2.isInitialized() || !this.ocrService.isInitialized()))) return [3 /*break*/, 4];
1311
- return [4 /*yield*/, this.initializeScreenVision()];
1312
- case 3:
1313
- _a.sent();
1314
- _a.label = 4;
1315
- case 4:
1316
- // Start processing for new mode
1317
- this.startProcessing();
1318
- return [2 /*return*/];
1319
- }
1320
- });
1321
- });
1322
- };
1323
- VisionService.prototype.stopProcessing = function () {
1324
- if (this.frameProcessingInterval) {
1325
- clearInterval(this.frameProcessingInterval);
1326
- this.frameProcessingInterval = null;
1327
- }
1328
- if (this.screenProcessingInterval) {
1329
- clearInterval(this.screenProcessingInterval);
1330
- this.screenProcessingInterval = null;
1331
- }
1332
- };
1333
- VisionService.prototype.getCameraInfo = function () {
1334
- if (!this.camera) {
1335
- return null;
1336
- }
1337
- return {
1338
- id: this.camera.id,
1339
- name: this.camera.name,
1340
- connected: true,
1341
- };
1342
- };
1343
- VisionService.prototype.isActive = function () {
1344
- return this.camera !== null && this.frameProcessingInterval !== null;
1345
- };
1346
- // Helper methods for face recognition
1347
- VisionService.prototype.calculateBoxOverlap = function (box1, box2) {
1348
- var x1 = Math.max(box1.x, box2.x);
1349
- var y1 = Math.max(box1.y, box2.y);
1350
- var x2 = Math.min(box1.x + box1.width, box2.x + box2.width);
1351
- var y2 = Math.min(box1.y + box1.height, box2.y + box2.height);
1352
- if (x2 < x1 || y2 < y1) {
1353
- return 0;
1354
- }
1355
- var intersection = (x2 - x1) * (y2 - y1);
1356
- var area1 = box1.width * box1.height;
1357
- var area2 = box2.width * box2.height;
1358
- var union = area1 + area2 - intersection;
1359
- return intersection / union;
1360
- };
1361
- VisionService.prototype.getDominantExpression = function (expressions) {
1362
- var maxValue = 0;
1363
- var dominantExpression = 'neutral';
1364
- for (var _i = 0, _a = Object.entries(expressions); _i < _a.length; _i++) {
1365
- var _b = _a[_i], expression = _b[0], value = _b[1];
1366
- if (typeof value === 'number' && value > maxValue) {
1367
- maxValue = value;
1368
- dominantExpression = expression;
1369
- }
1370
- }
1371
- return dominantExpression;
1372
- };
1373
- // Public methods for entity tracking
1374
- VisionService.prototype.getEntityTracker = function () {
1375
- return this.entityTracker;
1376
- };
1377
- VisionService.prototype.getFaceRecognition = function () {
1378
- return this.faceRecognition;
1379
- };
1380
- VisionService.prototype.stop = function () {
1381
- return __awaiter(this, void 0, void 0, function () {
1382
- return __generator(this, function (_a) {
1383
- switch (_a.label) {
1384
- case 0:
1385
- core_1.logger.info('[VisionService] Stopping vision service...');
1386
- this.stopProcessing();
1387
- if (!this.audioCapture) return [3 /*break*/, 2];
1388
- return [4 /*yield*/, this.audioCapture.stop()];
1389
- case 1:
1390
- _a.sent();
1391
- this.audioCapture = null;
1392
- _a.label = 2;
1393
- case 2:
1394
- if (!this.streamingAudioCapture) return [3 /*break*/, 4];
1395
- return [4 /*yield*/, this.streamingAudioCapture.stop()];
1396
- case 3:
1397
- _a.sent();
1398
- this.streamingAudioCapture = null;
1399
- _a.label = 4;
1400
- case 4:
1401
- if (!this.visionModels) return [3 /*break*/, 6];
1402
- return [4 /*yield*/, this.visionModels.dispose()];
1403
- case 5:
1404
- _a.sent();
1405
- _a.label = 6;
1406
- case 6:
1407
- if (!this.workerManager) return [3 /*break*/, 8];
1408
- return [4 /*yield*/, this.workerManager.stop()];
1409
- case 7:
1410
- _a.sent();
1411
- this.workerManager = null;
1412
- _a.label = 8;
1413
- case 8:
1414
- this.camera = null;
1415
- this.lastFrame = null;
1416
- this.lastSceneDescription = null;
1417
- this.lastScreenCapture = null;
1418
- this.lastEnhancedScene = null;
1419
- this.isProcessing = false;
1420
- this.isProcessingScreen = false;
1421
- // Dispose of models
1422
- return [4 /*yield*/, this.florence2.dispose()];
1423
- case 9:
1424
- // Dispose of models
1425
- _a.sent();
1426
- return [4 /*yield*/, this.ocrService.dispose()];
1427
- case 10:
1428
- _a.sent();
1429
- core_1.logger.info('[VisionService] Stopped.');
1430
- return [2 /*return*/];
1431
- }
1432
- });
1433
- });
1434
- };
1435
- VisionService.prototype.findCamera = function () {
1436
- return __awaiter(this, void 0, void 0, function () {
1437
- var cameras, searchName_1, matchedCamera, error_11;
1438
- return __generator(this, function (_a) {
1439
- switch (_a.label) {
1440
- case 0:
1441
- _a.trys.push([0, 2, , 3]);
1442
- return [4 /*yield*/, this.listCameras()];
1443
- case 1:
1444
- cameras = _a.sent();
1445
- if (cameras.length === 0) {
1446
- core_1.logger.warn('[VisionService] No cameras detected');
1447
- return [2 /*return*/, null];
1448
- }
1449
- // If camera name is specified, try to find it
1450
- if (this.visionConfig.cameraName) {
1451
- searchName_1 = this.visionConfig.cameraName.toLowerCase();
1452
- matchedCamera = cameras.find(function (cam) { return cam.name.toLowerCase().includes(searchName_1); });
1453
- if (matchedCamera) {
1454
- return [2 /*return*/, this.createCameraDevice(matchedCamera)];
1455
- }
1456
- core_1.logger.warn("[VisionService] Camera \"".concat(this.visionConfig.cameraName, "\" not found, using default"));
1457
- }
1458
- // Use first available camera
1459
- return [2 /*return*/, this.createCameraDevice(cameras[0])];
1460
- case 2:
1461
- error_11 = _a.sent();
1462
- core_1.logger.error('[VisionService] Error finding camera:', error_11);
1463
- return [2 /*return*/, null];
1464
- case 3: return [2 /*return*/];
1465
- }
1466
- });
1467
- });
1468
- };
1469
- VisionService.prototype.listCameras = function () {
1470
- return __awaiter(this, void 0, void 0, function () {
1471
- var platform, stdout, data, cameras, _i, _a, camera, stdout, cameras, lines, currentName, _b, lines_1, line, devicePath, id, stdout, devices, cameras, _c, devices_1, device, error_12;
1472
- return __generator(this, function (_d) {
1473
- switch (_d.label) {
1474
- case 0:
1475
- platform = process.platform;
1476
- _d.label = 1;
1477
- case 1:
1478
- _d.trys.push([1, 8, , 9]);
1479
- if (!(platform === 'darwin')) return [3 /*break*/, 3];
1480
- return [4 /*yield*/, execAsync('system_profiler SPCameraDataType -json')];
1481
- case 2:
1482
- stdout = (_d.sent()).stdout;
1483
- data = JSON.parse(stdout);
1484
- cameras = [];
1485
- if (data.SPCameraDataType && Array.isArray(data.SPCameraDataType)) {
1486
- for (_i = 0, _a = data.SPCameraDataType; _i < _a.length; _i++) {
1487
- camera = _a[_i];
1488
- cameras.push({
1489
- id: camera.unique_id || camera._name,
1490
- name: camera._name,
1491
- connected: true,
1492
- });
1493
- }
1494
- }
1495
- return [2 /*return*/, cameras];
1496
- case 3:
1497
- if (!(platform === 'linux')) return [3 /*break*/, 5];
1498
- return [4 /*yield*/, execAsync('v4l2-ctl --list-devices')];
1499
- case 4:
1500
- stdout = (_d.sent()).stdout;
1501
- cameras = [];
1502
- lines = stdout.split('\n');
1503
- currentName = '';
1504
- for (_b = 0, lines_1 = lines; _b < lines_1.length; _b++) {
1505
- line = lines_1[_b];
1506
- if (line && !line.startsWith('\t')) {
1507
- currentName = line.replace(':', '').trim();
1508
- }
1509
- else if (line.trim().startsWith('/dev/video')) {
1510
- devicePath = line.trim();
1511
- id = devicePath.replace('/dev/video', '');
1512
- cameras.push({
1513
- id: id,
1514
- name: currentName,
1515
- connected: true,
1516
- });
1517
- }
1518
- }
1519
- return [2 /*return*/, cameras];
1520
- case 5:
1521
- if (!(platform === 'win32')) return [3 /*break*/, 7];
1522
- return [4 /*yield*/, execAsync('powershell -Command "Get-PnpDevice -Class Camera | Select-Object FriendlyName, InstanceId | ConvertTo-Json"')];
1523
- case 6:
1524
- stdout = (_d.sent()).stdout;
1525
- devices = JSON.parse(stdout);
1526
- cameras = [];
1527
- if (Array.isArray(devices)) {
1528
- for (_c = 0, devices_1 = devices; _c < devices_1.length; _c++) {
1529
- device = devices_1[_c];
1530
- cameras.push({
1531
- id: device.InstanceId,
1532
- name: device.FriendlyName,
1533
- connected: true,
1534
- });
1535
- }
1536
- }
1537
- return [2 /*return*/, cameras];
1538
- case 7: return [2 /*return*/, []];
1539
- case 8:
1540
- error_12 = _d.sent();
1541
- core_1.logger.error('[VisionService] Error listing cameras:', error_12);
1542
- return [2 /*return*/, []];
1543
- case 9: return [2 /*return*/];
1544
- }
1545
- });
1546
- });
1547
- };
1548
- VisionService.prototype.createCameraDevice = function (info) {
1549
- var _this = this;
1550
- var platform = process.platform;
1551
- return {
1552
- id: info.id,
1553
- name: info.name,
1554
- capture: function () { return __awaiter(_this, void 0, void 0, function () {
1555
- var tempFile, error_13, error_14, error_15, imageBuffer, error_16;
1556
- return __generator(this, function (_a) {
1557
- switch (_a.label) {
1558
- case 0:
1559
- tempFile = path.join(process.cwd(), "temp_capture_".concat(Date.now(), ".jpg"));
1560
- _a.label = 1;
1561
- case 1:
1562
- _a.trys.push([1, 20, , 22]);
1563
- if (!(platform === 'darwin')) return [3 /*break*/, 6];
1564
- _a.label = 2;
1565
- case 2:
1566
- _a.trys.push([2, 4, , 5]);
1567
- return [4 /*yield*/, execAsync("imagesnap -d \"".concat(info.name, "\" \"").concat(tempFile, "\""))];
1568
- case 3:
1569
- _a.sent();
1570
- return [3 /*break*/, 5];
1571
- case 4:
1572
- error_13 = _a.sent();
1573
- if (error_13.message.includes('command not found')) {
1574
- throw new Error('imagesnap not installed. Run: brew install imagesnap');
1575
- }
1576
- throw error_13;
1577
- case 5: return [3 /*break*/, 17];
1578
- case 6:
1579
- if (!(platform === 'linux')) return [3 /*break*/, 11];
1580
- _a.label = 7;
1581
- case 7:
1582
- _a.trys.push([7, 9, , 10]);
1583
- return [4 /*yield*/, execAsync("fswebcam -d /dev/video".concat(info.id, " -r 1280x720 --jpeg 85 \"").concat(tempFile, "\""))];
1584
- case 8:
1585
- _a.sent();
1586
- return [3 /*break*/, 10];
1587
- case 9:
1588
- error_14 = _a.sent();
1589
- if (error_14.message.includes('command not found')) {
1590
- throw new Error('fswebcam not installed. Run: sudo apt-get install fswebcam');
1591
- }
1592
- throw error_14;
1593
- case 10: return [3 /*break*/, 17];
1594
- case 11:
1595
- if (!(platform === 'win32')) return [3 /*break*/, 16];
1596
- _a.label = 12;
1597
- case 12:
1598
- _a.trys.push([12, 14, , 15]);
1599
- return [4 /*yield*/, execAsync("ffmpeg -f dshow -i video=\"".concat(info.name, "\" -frames:v 1 -q:v 2 \"").concat(tempFile, "\" -y"))];
1600
- case 13:
1601
- _a.sent();
1602
- return [3 /*break*/, 15];
1603
- case 14:
1604
- error_15 = _a.sent();
1605
- if (error_15.message.includes('not recognized') || error_15.message.includes('not found')) {
1606
- throw new Error('ffmpeg not installed. Download from ffmpeg.org and add to PATH');
1607
- }
1608
- throw error_15;
1609
- case 15: return [3 /*break*/, 17];
1610
- case 16: throw new Error("Unsupported platform: ".concat(platform));
1611
- case 17: return [4 /*yield*/, fs.readFile(tempFile)];
1612
- case 18:
1613
- imageBuffer = _a.sent();
1614
- // Clean up temp file
1615
- return [4 /*yield*/, fs.unlink(tempFile).catch(function () { })];
1616
- case 19:
1617
- // Clean up temp file
1618
- _a.sent();
1619
- return [2 /*return*/, imageBuffer];
1620
- case 20:
1621
- error_16 = _a.sent();
1622
- // Clean up temp file on error
1623
- return [4 /*yield*/, fs.unlink(tempFile).catch(function () { })];
1624
- case 21:
1625
- // Clean up temp file on error
1626
- _a.sent();
1627
- throw error_16;
1628
- case 22: return [2 /*return*/];
1629
- }
1630
- });
1631
- }); },
1632
- };
1633
- };
1634
- VisionService.prototype.captureImage = function () {
1635
- return __awaiter(this, void 0, void 0, function () {
1636
- var error_17;
1637
- return __generator(this, function (_a) {
1638
- switch (_a.label) {
1639
- case 0:
1640
- if (!this.camera) {
1641
- core_1.logger.warn('[VisionService] No camera available for capture');
1642
- return [2 /*return*/, null];
1643
- }
1644
- _a.label = 1;
1645
- case 1:
1646
- _a.trys.push([1, 3, , 4]);
1647
- return [4 /*yield*/, this.camera.capture()];
1648
- case 2: return [2 /*return*/, _a.sent()];
1649
- case 3:
1650
- error_17 = _a.sent();
1651
- core_1.logger.error('[VisionService] Failed to capture image:', error_17);
1652
- return [2 /*return*/, null];
1653
- case 4: return [2 /*return*/];
1654
- }
1655
- });
1656
- });
1657
- };
1658
- VisionService.serviceType = types_1.VisionServiceType.VISION;
1659
- return VisionService;
1660
- }(core_1.Service));
1661
- exports.VisionService = VisionService;
1662
- //# sourceMappingURL=service.js.map