@elizaos/plugin-vision 1.2.1 → 2.0.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/build.config.ts +53 -53
  2. package/dist/index.js +6716 -67
  3. package/dist/index.js.map +33 -1
  4. package/dist/workers/florence2-worker.js +112304 -307
  5. package/dist/workers/florence2-worker.js.map +92 -1
  6. package/dist/workers/ocr-worker.js +119718 -339
  7. package/dist/workers/ocr-worker.js.map +137 -1
  8. package/dist/workers/screen-capture-worker.js +350 -418
  9. package/dist/workers/screen-capture-worker.js.map +11 -1
  10. package/package.json +13 -17
  11. package/README.md +0 -270
  12. package/dist/action.d.ts +0 -8
  13. package/dist/action.js +0 -1212
  14. package/dist/action.js.map +0 -1
  15. package/dist/audio-capture-stream.d.ts +0 -42
  16. package/dist/audio-capture-stream.js +0 -516
  17. package/dist/audio-capture-stream.js.map +0 -1
  18. package/dist/audio-capture.d.ts +0 -25
  19. package/dist/audio-capture.js +0 -412
  20. package/dist/audio-capture.js.map +0 -1
  21. package/dist/basic.test.d.ts +0 -1
  22. package/dist/basic.test.js +0 -97
  23. package/dist/basic.test.js.map +0 -1
  24. package/dist/config.d.ts +0 -73
  25. package/dist/config.js +0 -254
  26. package/dist/config.js.map +0 -1
  27. package/dist/entity-tracker.d.ts +0 -32
  28. package/dist/entity-tracker.js +0 -361
  29. package/dist/entity-tracker.js.map +0 -1
  30. package/dist/errors.d.ts +0 -67
  31. package/dist/errors.js +0 -395
  32. package/dist/errors.js.map +0 -1
  33. package/dist/face-recognition.d.ts +0 -31
  34. package/dist/face-recognition.js +0 -332
  35. package/dist/face-recognition.js.map +0 -1
  36. package/dist/florence2-local.d.ts +0 -25
  37. package/dist/florence2-local.js +0 -280
  38. package/dist/florence2-local.js.map +0 -1
  39. package/dist/florence2-model.d.ts +0 -36
  40. package/dist/florence2-model.js +0 -503
  41. package/dist/florence2-model.js.map +0 -1
  42. package/dist/index.d.ts +0 -3
  43. package/dist/ocr-service-real.d.ts +0 -32
  44. package/dist/ocr-service-real.js +0 -396
  45. package/dist/ocr-service-real.js.map +0 -1
  46. package/dist/ocr-service.d.ts +0 -28
  47. package/dist/ocr-service.js +0 -216
  48. package/dist/ocr-service.js.map +0 -1
  49. package/dist/provider.d.ts +0 -2
  50. package/dist/provider.js +0 -285
  51. package/dist/provider.js.map +0 -1
  52. package/dist/screen-capture.d.ts +0 -16
  53. package/dist/screen-capture.js +0 -302
  54. package/dist/screen-capture.js.map +0 -1
  55. package/dist/service.d.ts +0 -73
  56. package/dist/service.js +0 -1662
  57. package/dist/service.js.map +0 -1
  58. package/dist/tests/e2e/index.d.ts +0 -8
  59. package/dist/tests/e2e/index.js +0 -33
  60. package/dist/tests/e2e/index.js.map +0 -1
  61. package/dist/tests/e2e/run-local.d.ts +0 -2
  62. package/dist/tests/e2e/run-local.js +0 -166
  63. package/dist/tests/e2e/run-local.js.map +0 -1
  64. package/dist/tests/e2e/screen-vision.d.ts +0 -11
  65. package/dist/tests/e2e/screen-vision.js +0 -384
  66. package/dist/tests/e2e/screen-vision.js.map +0 -1
  67. package/dist/tests/e2e/vision-autonomy.d.ts +0 -11
  68. package/dist/tests/e2e/vision-autonomy.js +0 -375
  69. package/dist/tests/e2e/vision-autonomy.js.map +0 -1
  70. package/dist/tests/e2e/vision-basic.d.ts +0 -11
  71. package/dist/tests/e2e/vision-basic.js +0 -434
  72. package/dist/tests/e2e/vision-basic.js.map +0 -1
  73. package/dist/tests/e2e/vision-capture-log.d.ts +0 -11
  74. package/dist/tests/e2e/vision-capture-log.js +0 -302
  75. package/dist/tests/e2e/vision-capture-log.js.map +0 -1
  76. package/dist/tests/e2e/vision-runtime.d.ts +0 -11
  77. package/dist/tests/e2e/vision-runtime.js +0 -357
  78. package/dist/tests/e2e/vision-runtime.js.map +0 -1
  79. package/dist/tests/e2e/vision-worker-tests.d.ts +0 -11
  80. package/dist/tests/e2e/vision-worker-tests.js +0 -466
  81. package/dist/tests/e2e/vision-worker-tests.js.map +0 -1
  82. package/dist/tests/test-pattern-generator.d.ts +0 -40
  83. package/dist/tests/test-pattern-generator.js +0 -191
  84. package/dist/tests/test-pattern-generator.js.map +0 -1
  85. package/dist/tests.d.ts +0 -3
  86. package/dist/tests.js +0 -11
  87. package/dist/tests.js.map +0 -1
  88. package/dist/types.d.ts +0 -222
  89. package/dist/types.js +0 -16
  90. package/dist/types.js.map +0 -1
  91. package/dist/vision-models.d.ts +0 -47
  92. package/dist/vision-models.js +0 -501
  93. package/dist/vision-models.js.map +0 -1
  94. package/dist/vision-worker-manager.d.ts +0 -61
  95. package/dist/vision-worker-manager.js +0 -668
  96. package/dist/vision-worker-manager.js.map +0 -1
  97. package/dist/workers/florence2-worker-simple.d.ts +0 -13
  98. package/dist/workers/florence2-worker-simple.js +0 -121
  99. package/dist/workers/florence2-worker-simple.js.map +0 -1
  100. package/dist/workers/florence2-worker.d.ts +0 -1
  101. package/dist/workers/ocr-worker.d.ts +0 -1
  102. package/dist/workers/screen-capture-worker.d.ts +0 -1
  103. package/dist/workers/worker-logger.d.ts +0 -9
  104. package/dist/workers/worker-logger.js +0 -95
  105. package/dist/workers/worker-logger.js.map +0 -1
@@ -1,280 +0,0 @@
1
- "use strict";
2
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Compiler-emitted async helper: reuses this.__awaiter when one exists, otherwise drives `generator` to completion and returns a promise of type P (Promise when P is falsy).
3
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // Wraps a value in P unless it already is an instance of P.
4
- return new (P || (P = Promise))(function (resolve, reject) {
5
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } // Resumes the generator with a settled value; a synchronous throw rejects the outer promise.
6
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } // Throws the rejection reason into the generator so its own try/catch can see it.
7
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } // Finished: resolve with the final value; otherwise wait on the yielded value and continue.
8
- step((generator = generator.apply(thisArg, _arguments || [])).next()); // Instantiates the generator with the caller's `this`/arguments and runs it up to the first yield synchronously.
9
- });
10
- };
11
- var __generator = (this && this.__generator) || function (thisArg, body) { // Compiler-emitted generator helper: reuses this.__generator when one exists, otherwise returns an iterator-like object that repeatedly calls body(state) and interprets the [opcode, value] arrays it returns.
12
- var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype); // `_` is the state handed to body: label = next case to run, sent() = last value passed in (throws when it was delivered as an error), trys = stack of try-region label tuples, ops = saved completions. f = "currently executing" flag, y = inner iterator being delegated to, t = scratch.
13
- return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; // Exposes next/throw/return as opcodes 0/1/2 and makes the object iterable when Symbol is available.
14
- function verb(n) { return function (v) { return step([n, v]); }; }
15
- function step(op) {
16
- if (f) throw new TypeError("Generator is already executing."); // Re-entrancy guard: step must not be called while body is running.
17
- while (g && (g = 0, op[0] && (_ = 0)), _) try {
18
- if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; // While delegating to y, forward next/throw/return to it and pass through its unfinished results.
19
- if (y = 0, t) op = [op[0] & 2, t.value];
20
- switch (op[0]) {
21
- case 0: case 1: t = op; break; // next / throw: remember the incoming value so _.sent() can return or rethrow it.
22
- case 4: _.label++; return { value: op[1], done: false }; // Opcode 4 (emitted as /*yield*/ in compiled bodies): advance the label and hand the value out.
23
- case 5: _.label++; y = op[1]; op = [0]; continue; // Opcode 5: advance the label and start delegating to the iterator in op[1] (presumably yield* - confirm against the TypeScript helper source).
24
- case 7: op = _.ops.pop(); _.trys.pop(); continue; // Opcode 7: restore the completion saved in ops and leave the current try region (presumably end-of-finally - confirm).
25
- default:
26
- if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } // Error (6) or return (2) with no enclosing try region: stop the state machine.
27
- if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } // Opcode 3 (emitted as /*break*/): jump to the target label when it does not leave the current try region.
28
- if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } // Error raised before the label in t[1]: jump there and keep the error for _.sent().
29
- if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } // Otherwise run the label in t[2] first and park the pending completion in ops.
30
- if (t[2]) _.ops.pop();
31
- _.trys.pop(); continue;
32
- }
33
- op = body.call(thisArg, _); // Run the next slice of the compiled function body.
34
- } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } // A throw from body (or from the delegate) becomes opcode 6 carrying the error.
35
- if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; // Loop exited: rethrow an unhandled error, otherwise finish with the returned value (undefined when there is none).
36
- }
37
- };
38
- Object.defineProperty(exports, "__esModule", { value: true });
39
- exports.Florence2Local = void 0;
40
- var tf = require("@tensorflow/tfjs-node");
41
- var core_1 = require("@elizaos/core");
42
- var sharp_1 = require("sharp");
43
- var Florence2Local = /** @class */ (function () {
44
- function Florence2Local(config) {
45
- this.model = null;
46
- this.initialized = false;
47
- // Model constants
48
- this.IMAGE_SIZE = 384; // Florence-2 uses 384x384 input
49
- this.VOCAB_SIZE = 51289;
50
- this.config = {
51
- modelPath: (config === null || config === void 0 ? void 0 : config.modelPath) || './models/florence2',
52
- modelUrl: (config === null || config === void 0 ? void 0 : config.modelUrl) ||
53
- 'https://huggingface.co/microsoft/Florence-2-base/resolve/main/model.json',
54
- cacheDir: (config === null || config === void 0 ? void 0 : config.cacheDir) || './models/cache',
55
- };
56
- }
57
- Florence2Local.prototype.initialize = function () {
58
- return __awaiter(this, void 0, void 0, function () {
59
- var _a, error_1;
60
- return __generator(this, function (_b) {
61
- switch (_b.label) {
62
- case 0:
63
- if (this.initialized) {
64
- return [2 /*return*/];
65
- }
66
- _b.label = 1;
67
- case 1:
68
- _b.trys.push([1, 3, , 4]);
69
- core_1.logger.info('[Florence2Local] Initializing local Florence-2 model...');
70
- // For now, we'll use a simplified vision model approach
71
- // In a real implementation, you would load the actual Florence-2 model
72
- // Since Florence-2 is quite large and complex, we'll use a practical approach
73
- // Instead of loading the full Florence-2 model (which would require significant setup),
74
- // we'll use TensorFlow.js with MobileNet for basic image understanding
75
- // and combine it with other models for a Florence-2-like experience
76
- _a = this;
77
- return [4 /*yield*/, tf.loadGraphModel('https://tfhub.dev/google/tfjs-model/imagenet/mobilenet_v3_small_100_224/feature_vector/5/default/1')];
78
- case 2:
79
- // For now, we'll use a simplified vision model approach
80
- // In a real implementation, you would load the actual Florence-2 model
81
- // Since Florence-2 is quite large and complex, we'll use a practical approach
82
- // Instead of loading the full Florence-2 model (which would require significant setup),
83
- // we'll use TensorFlow.js with MobileNet for basic image understanding
84
- // and combine it with other models for a Florence-2-like experience
85
- _a.model = _b.sent();
86
- this.initialized = true;
87
- core_1.logger.info('[Florence2Local] Model initialized successfully');
88
- return [3 /*break*/, 4];
89
- case 3:
90
- error_1 = _b.sent();
91
- core_1.logger.error('[Florence2Local] Failed to initialize model:', error_1);
92
- // Don't throw - we'll use enhanced mock fallback
93
- this.initialized = true;
94
- return [3 /*break*/, 4];
95
- case 4: return [2 /*return*/];
96
- }
97
- });
98
- });
99
- };
100
- Florence2Local.prototype.analyzeImage = function (imageBuffer) {
101
- return __awaiter(this, void 0, void 0, function () {
102
- var preprocessed, predictions, error_2;
103
- return __generator(this, function (_a) {
104
- switch (_a.label) {
105
- case 0:
106
- if (!!this.initialized) return [3 /*break*/, 2];
107
- return [4 /*yield*/, this.initialize()];
108
- case 1:
109
- _a.sent();
110
- _a.label = 2;
111
- case 2:
112
- _a.trys.push([2, 8, , 10]);
113
- return [4 /*yield*/, this.preprocessImage(imageBuffer)];
114
- case 3:
115
- preprocessed = _a.sent();
116
- if (!this.model) return [3 /*break*/, 5];
117
- return [4 /*yield*/, this.runInference(preprocessed)];
118
- case 4:
119
- predictions = _a.sent();
120
- preprocessed.dispose();
121
- return [2 /*return*/, this.parseModelOutput(predictions)];
122
- case 5:
123
- // Enhanced fallback with basic image analysis
124
- preprocessed.dispose();
125
- return [4 /*yield*/, this.enhancedFallback(imageBuffer)];
126
- case 6: return [2 /*return*/, _a.sent()];
127
- case 7: return [3 /*break*/, 10];
128
- case 8:
129
- error_2 = _a.sent();
130
- core_1.logger.error('[Florence2Local] Analysis failed:', error_2);
131
- return [4 /*yield*/, this.enhancedFallback(imageBuffer)];
132
- case 9: return [2 /*return*/, _a.sent()];
133
- case 10: return [2 /*return*/];
134
- }
135
- });
136
- });
137
- };
138
- Florence2Local.prototype.preprocessImage = function (imageBuffer) {
139
- return __awaiter(this, void 0, void 0, function () {
140
- var resized, tensor, normalized;
141
- return __generator(this, function (_a) {
142
- switch (_a.label) {
143
- case 0: return [4 /*yield*/, (0, sharp_1.default)(imageBuffer)
144
- .resize(224, 224) // MobileNet uses 224x224
145
- .raw()
146
- .toBuffer()];
147
- case 1:
148
- resized = _a.sent();
149
- tensor = tf.node.decodeImage(resized, 3);
150
- normalized = tf.div(tensor, 255.0);
151
- return [2 /*return*/, normalized];
152
- }
153
- });
154
- });
155
- };
156
- Florence2Local.prototype.runInference = function (input) {
157
- return __awaiter(this, void 0, void 0, function () {
158
- var batched, output;
159
- return __generator(this, function (_a) {
160
- if (!this.model) {
161
- throw new Error('Model not loaded');
162
- }
163
- batched = input.expandDims(0);
164
- output = this.model.predict(batched);
165
- batched.dispose();
166
- return [2 /*return*/, output];
167
- });
168
- });
169
- };
170
- Florence2Local.prototype.parseModelOutput = function (predictions) {
171
- return __awaiter(this, void 0, void 0, function () {
172
- var values, caption;
173
- return __generator(this, function (_a) {
174
- switch (_a.label) {
175
- case 0: return [4 /*yield*/, predictions.array()];
176
- case 1:
177
- values = _a.sent();
178
- predictions.dispose();
179
- caption = this.generateCaptionFromFeatures(values);
180
- return [2 /*return*/, {
181
- caption: caption,
182
- objects: [], // Would be populated by actual object detection
183
- regions: [],
184
- tags: this.extractTagsFromCaption(caption),
185
- }];
186
- }
187
- });
188
- });
189
- };
190
- Florence2Local.prototype.generateCaptionFromFeatures = function (features) {
191
- // Simplified caption generation
192
- // In reality, Florence-2 would use its language model to generate captions
193
- var scenes = [
194
- 'Indoor scene with various objects visible',
195
- 'Person in a room with furniture',
196
- 'Computer workspace with monitor and desk',
197
- 'Living space with natural lighting',
198
- 'Office environment with equipment',
199
- ];
200
- // Use feature values to select most appropriate caption
201
- var index = Math.abs(features[0][0]) * scenes.length;
202
- return scenes[Math.floor(index) % scenes.length];
203
- };
204
- Florence2Local.prototype.extractTagsFromCaption = function (caption) {
205
- var words = caption.toLowerCase().split(/\s+/);
206
- var validTags = [
207
- 'indoor',
208
- 'outdoor',
209
- 'person',
210
- 'computer',
211
- 'desk',
212
- 'office',
213
- 'room',
214
- 'furniture',
215
- 'monitor',
216
- 'workspace',
217
- ];
218
- return words.filter(function (word) { return validTags.includes(word); });
219
- };
220
- Florence2Local.prototype.enhancedFallback = function (imageBuffer) {
221
- return __awaiter(this, void 0, void 0, function () {
222
- var metadata, stats, brightness, isIndoor, caption, aspectRatio, dominantColor;
223
- return __generator(this, function (_a) {
224
- switch (_a.label) {
225
- case 0: return [4 /*yield*/, (0, sharp_1.default)(imageBuffer).metadata()];
226
- case 1:
227
- metadata = _a.sent();
228
- return [4 /*yield*/, (0, sharp_1.default)(imageBuffer).stats()];
229
- case 2:
230
- stats = _a.sent();
231
- brightness = (stats.channels[0].mean + stats.channels[1].mean + stats.channels[2].mean) / 3;
232
- isIndoor = brightness < 180;
233
- caption = isIndoor ? 'Indoor scene' : 'Outdoor scene';
234
- // Add more context based on image properties
235
- if (metadata.width && metadata.height) {
236
- aspectRatio = metadata.width / metadata.height;
237
- if (aspectRatio > 1.5) {
238
- caption += ' with wide field of view';
239
- }
240
- else if (aspectRatio < 0.7) {
241
- caption += ' in portrait orientation';
242
- }
243
- }
244
- dominantColor = stats.dominant;
245
- if (dominantColor.r > 200 && dominantColor.g > 200 && dominantColor.b > 200) {
246
- caption += ', well-lit environment';
247
- }
248
- else if (dominantColor.r < 100 && dominantColor.g < 100 && dominantColor.b < 100) {
249
- caption += ', dimly lit conditions';
250
- }
251
- return [2 /*return*/, {
252
- caption: caption,
253
- objects: [],
254
- regions: [],
255
- tags: this.extractTagsFromCaption(caption),
256
- }];
257
- }
258
- });
259
- });
260
- };
261
- Florence2Local.prototype.isInitialized = function () {
262
- return this.initialized;
263
- };
264
- Florence2Local.prototype.dispose = function () {
265
- return __awaiter(this, void 0, void 0, function () {
266
- return __generator(this, function (_a) {
267
- if (this.model) {
268
- this.model.dispose();
269
- this.model = null;
270
- }
271
- this.initialized = false;
272
- core_1.logger.info('[Florence2Local] Model disposed');
273
- return [2 /*return*/];
274
- });
275
- });
276
- };
277
- return Florence2Local;
278
- }());
279
- exports.Florence2Local = Florence2Local;
280
- //# sourceMappingURL=florence2-local.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"florence2-local.js","sourceRoot":"","sources":["../src/florence2-local.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,0CAA4C;AAC5C,sCAAuC;AAEvC,+BAA0B;AAQ1B;IASE,wBAAY,MAA6B;QARjC,UAAK,GAAyB,IAAI,CAAC;QACnC,gBAAW,GAAG,KAAK,CAAC;QAG5B,kBAAkB;QACD,eAAU,GAAG,GAAG,CAAC,CAAC,gCAAgC;QAClD,eAAU,GAAG,KAAK,CAAC;QAGlC,IAAI,CAAC,MAAM,GAAG;YACZ,SAAS,EAAE,CAAA,MAAM,aAAN,MAAM,uBAAN,MAAM,CAAE,SAAS,KAAI,oBAAoB;YACpD,QAAQ,EACN,CAAA,MAAM,aAAN,MAAM,uBAAN,MAAM,CAAE,QAAQ;gBAChB,0EAA0E;YAC5E,QAAQ,EAAE,CAAA,MAAM,aAAN,MAAM,uBAAN,MAAM,CAAE,QAAQ,KAAI,gBAAgB;SAC/C,CAAC;IACJ,CAAC;IAEK,mCAAU,GAAhB;;;;;;wBACE,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;4BACrB,sBAAO;wBACT,CAAC;;;;wBAGC,aAAM,CAAC,IAAI,CAAC,yDAAyD,CAAC,CAAC;wBAEvE,wDAAwD;wBACxD,uEAAuE;wBACvE,8EAA8E;wBAE9E,wFAAwF;wBACxF,uEAAuE;wBACvE,oEAAoE;wBAEpE,KAAA,IAAI,CAAA;wBAAS,qBAAM,EAAE,CAAC,cAAc,CAClC,oGAAoG,CACrG,EAAA;;wBAVD,wDAAwD;wBACxD,uEAAuE;wBACvE,8EAA8E;wBAE9E,wFAAwF;wBACxF,uEAAuE;wBACvE,oEAAoE;wBAEpE,GAAK,KAAK,GAAG,SAEZ,CAAC;wBAEF,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;wBACxB,aAAM,CAAC,IAAI,CAAC,iDAAiD,CAAC,CAAC;;;;wBAE/D,aAAM,CAAC,KAAK,CAAC,8CAA8C,EAAE,OAAK,CAAC,CAAC;wBACpE,iDAAiD;wBACjD,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC;;;;;;KAE3B;IAEK,qCAAY,GAAlB,UAAmB,WAAmB;;;;;;6BAChC,CAAC,IAAI,CAAC,WAAW,EAAjB,wBAAiB;wBACnB,qBAAM,IAAI,CAAC,UAAU,EAAE,EAAA;;wBAAvB,SAAuB,CAAC;;;;wBAKH,qBAAM,IAAI,CAAC,eAAe,CAAC,WAAW,CAAC,EAAA;;wBAAtD,YAAY,GAAG,SAAuC;6BAExD,IAAI,CAAC,KAAK,EAAV,wBAAU;wBAEQ,qBAAM,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,EAAA;;wBAAnD,WAAW,GAAG,SAAqC;wBACzD,YAAY,CAAC,OAAO,EAAE,CAAC;wBAEvB,sBAAO,IAAI,CAAC,gBAAgB,CAAC,WAAW,CAAC,EAAC;;wBAE1C,8CAA8C;wBAC9C,YAAY,CAAC,OAAO,EAAE,CAAC;wBAChB,qBAAM,IAAI,CAAC,gBAAgB,CAAC,WAAW,CAAC,EAAA;4BAA/C,sBAAO,SAAwC,EAAC;;;;wBAGlD,aAAM,CAAC,KAAK,CAAC,mCAAmC,EAAE,OAAK,CAAC,CAAC;wBAClD,qBAAM,IAAI,CAAC,gBAAgB,CAAC,WAAW,CAAC,EAAA;4BAA/C,sBAAO,SAAwC,EAAC;;;;;KAEnD;IAEa,wCAAe,GAA7B,UAA8B,WAAmB;;;;;4BAE/B,qBAAM,IAAA,eAAK,EAAC,WAAW,CAAC;6BACrC,MAAM,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,yBAAyB;6BAC1C,GAAG,EAAE;6BA
CL,QAAQ,EAAE,EAAA;;wBAHP,OAAO,GAAG,SAGH;wBAGP,MAAM,GAAG,EAAE,CAAC,IAAI,CAAC,WAAW,CAAC,OAAO,EAAE,CAAC,CAAC,CAAC;wBACzC,UAAU,GAAG,EAAE,CAAC,GAAG,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC;wBAEzC,sBAAO,UAAyB,EAAC;;;;KAClC;IAEa,qCAAY,GAA1B,UAA2B,KAAkB;;;;gBAC3C,IAAI,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC;oBAChB,MAAM,IAAI,KAAK,CAAC,kBAAkB,CAAC,CAAC;gBACtC,CAAC;gBAGK,OAAO,GAAG,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;gBAG9B,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,OAAO,CAAc,CAAC;gBAExD,OAAO,CAAC,OAAO,EAAE,CAAC;gBAElB,sBAAO,MAAM,EAAC;;;KACf;IAEa,yCAAgB,GAA9B,UAA+B,WAAsB;;;;;4BAIpC,qBAAM,WAAW,CAAC,KAAK,EAAE,EAAA;;wBAAlC,MAAM,GAAG,SAAyB;wBACxC,WAAW,CAAC,OAAO,EAAE,CAAC;wBAGhB,OAAO,GAAG,IAAI,CAAC,2BAA2B,CAAC,MAAM,CAAC,CAAC;wBAEzD,sBAAO;gCACL,OAAO,SAAA;gCACP,OAAO,EAAE,EAAE,EAAE,gDAAgD;gCAC7D,OAAO,EAAE,EAAE;gCACX,IAAI,EAAE,IAAI,CAAC,sBAAsB,CAAC,OAAO,CAAC;6BAC3C,EAAC;;;;KACH;IAEO,oDAA2B,GAAnC,UAAoC,QAAa;QAC/C,gCAAgC;QAChC,2EAA2E;QAE3E,IAAM,MAAM,GAAG;YACb,2CAA2C;YAC3C,iCAAiC;YACjC,0CAA0C;YAC1C,oCAAoC;YACpC,mCAAmC;SACpC,CAAC;QAEF,wDAAwD;QACxD,IAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC;QACvD,OAAO,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC;IACnD,CAAC;IAEO,+CAAsB,GAA9B,UAA+B,OAAe;QAC5C,IAAM,KAAK,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;QACjD,IAAM,SAAS,GAAG;YAChB,QAAQ;YACR,SAAS;YACT,QAAQ;YACR,UAAU;YACV,MAAM;YACN,QAAQ;YACR,MAAM;YACN,WAAW;YACX,SAAS;YACT,WAAW;SACZ,CAAC;QACF,OAAO,KAAK,CAAC,MAAM,CAAC,UAAC,IAAI,IAAK,OAAA,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAxB,CAAwB,CAAC,CAAC;IAC1D,CAAC;IAEa,yCAAgB,GAA9B,UAA+B,WAAmB;;;;;4BAE/B,qBAAM,IAAA,eAAK,EAAC,WAAW,CAAC,CAAC,QAAQ,EAAE,EAAA;;wBAA9C,QAAQ,GAAG,SAAmC;wBACtC,qBAAM,IAAA,eAAK,EAAC,WAAW,CAAC,CAAC,KAAK,EAAE,EAAA;;wBAAxC,KAAK,GAAG,SAAgC;wBAGxC,UAAU,GACd,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,GAAG,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,GAAG,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;wBAC3E,QAAQ,GAAG,UAAU,GAAG,GAAG,CAAC;wBAG9B,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,cAAc,
CAAC,CAAC,CAAC,eAAe,CAAC;wBAE1D,6CAA6C;wBAC7C,IAAI,QAAQ,CAAC,KAAK,IAAI,QAAQ,CAAC,MAAM,EAAE,CAAC;4BAChC,WAAW,GAAG,QAAQ,CAAC,KAAK,GAAG,QAAQ,CAAC,MAAM,CAAC;4BACrD,IAAI,WAAW,GAAG,GAAG,EAAE,CAAC;gCACtB,OAAO,IAAI,0BAA0B,CAAC;4BACxC,CAAC;iCAAM,IAAI,WAAW,GAAG,GAAG,EAAE,CAAC;gCAC7B,OAAO,IAAI,0BAA0B,CAAC;4BACxC,CAAC;wBACH,CAAC;wBAGK,aAAa,GAAG,KAAK,CAAC,QAAQ,CAAC;wBACrC,IAAI,aAAa,CAAC,CAAC,GAAG,GAAG,IAAI,aAAa,CAAC,CAAC,GAAG,GAAG,IAAI,aAAa,CAAC,CAAC,GAAG,GAAG,EAAE,CAAC;4BAC5E,OAAO,IAAI,wBAAwB,CAAC;wBACtC,CAAC;6BAAM,IAAI,aAAa,CAAC,CAAC,GAAG,GAAG,IAAI,aAAa,CAAC,CAAC,GAAG,GAAG,IAAI,aAAa,CAAC,CAAC,GAAG,GAAG,EAAE,CAAC;4BACnF,OAAO,IAAI,wBAAwB,CAAC;wBACtC,CAAC;wBAED,sBAAO;gCACL,OAAO,SAAA;gCACP,OAAO,EAAE,EAAE;gCACX,OAAO,EAAE,EAAE;gCACX,IAAI,EAAE,IAAI,CAAC,sBAAsB,CAAC,OAAO,CAAC;6BAC3C,EAAC;;;;KACH;IAED,sCAAa,GAAb;QACE,OAAO,IAAI,CAAC,WAAW,CAAC;IAC1B,CAAC;IAEK,gCAAO,GAAb;;;gBACE,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;oBACf,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC;oBACrB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC;gBACpB,CAAC;gBACD,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC;gBACzB,aAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;;;;KAChD;IACH,qBAAC;AAAD,CAAC,AA/MD,IA+MC;AA/MY,wCAAc","sourcesContent":["import * as tf from '@tensorflow/tfjs-node';\nimport { logger } from '@elizaos/core';\nimport type { Florence2Result } from './types';\nimport sharp from 'sharp';\n\ninterface Florence2LocalConfig {\n modelPath?: string;\n modelUrl?: string;\n cacheDir?: string;\n}\n\nexport class Florence2Local {\n private model: tf.GraphModel | null = null;\n private initialized = false;\n private config: Florence2LocalConfig;\n\n // Model constants\n private readonly IMAGE_SIZE = 384; // Florence-2 uses 384x384 input\n private readonly VOCAB_SIZE = 51289;\n\n constructor(config?: Florence2LocalConfig) {\n this.config = {\n modelPath: config?.modelPath || './models/florence2',\n modelUrl:\n config?.modelUrl ||\n 'https://huggingface.co/microsoft/Florence-2-base/resolve/main/model.json',\n cacheDir: config?.cacheDir || './models/cache',\n };\n }\n\n async 
initialize(): Promise<void> {\n if (this.initialized) {\n return;\n }\n\n try {\n logger.info('[Florence2Local] Initializing local Florence-2 model...');\n\n // For now, we'll use a simplified vision model approach\n // In a real implementation, you would load the actual Florence-2 model\n // Since Florence-2 is quite large and complex, we'll use a practical approach\n\n // Instead of loading the full Florence-2 model (which would require significant setup),\n // we'll use TensorFlow.js with MobileNet for basic image understanding\n // and combine it with other models for a Florence-2-like experience\n\n this.model = await tf.loadGraphModel(\n 'https://tfhub.dev/google/tfjs-model/imagenet/mobilenet_v3_small_100_224/feature_vector/5/default/1'\n );\n\n this.initialized = true;\n logger.info('[Florence2Local] Model initialized successfully');\n } catch (error) {\n logger.error('[Florence2Local] Failed to initialize model:', error);\n // Don't throw - we'll use enhanced mock fallback\n this.initialized = true;\n }\n }\n\n async analyzeImage(imageBuffer: Buffer): Promise<Florence2Result> {\n if (!this.initialized) {\n await this.initialize();\n }\n\n try {\n // Preprocess image\n const preprocessed = await this.preprocessImage(imageBuffer);\n\n if (this.model) {\n // Run inference\n const predictions = await this.runInference(preprocessed);\n preprocessed.dispose();\n\n return this.parseModelOutput(predictions);\n } else {\n // Enhanced fallback with basic image analysis\n preprocessed.dispose();\n return await this.enhancedFallback(imageBuffer);\n }\n } catch (error) {\n logger.error('[Florence2Local] Analysis failed:', error);\n return await this.enhancedFallback(imageBuffer);\n }\n }\n\n private async preprocessImage(imageBuffer: Buffer): Promise<tf.Tensor3D> {\n // Resize and normalize image for model input\n const resized = await sharp(imageBuffer)\n .resize(224, 224) // MobileNet uses 224x224\n .raw()\n .toBuffer();\n\n // Convert to tensor and normalize\n const 
tensor = tf.node.decodeImage(resized, 3);\n const normalized = tf.div(tensor, 255.0);\n\n return normalized as tf.Tensor3D;\n }\n\n private async runInference(input: tf.Tensor3D): Promise<tf.Tensor> {\n if (!this.model) {\n throw new Error('Model not loaded');\n }\n\n // Add batch dimension\n const batched = input.expandDims(0);\n\n // Run model\n const output = this.model.predict(batched) as tf.Tensor;\n\n batched.dispose();\n\n return output;\n }\n\n private async parseModelOutput(predictions: tf.Tensor): Promise<Florence2Result> {\n // Since we're using MobileNet as a placeholder, we'll create a basic caption\n // In a real Florence-2 implementation, this would decode the model's actual output\n\n const values = await predictions.array();\n predictions.dispose();\n\n // Generate a basic caption based on feature analysis\n const caption = this.generateCaptionFromFeatures(values);\n\n return {\n caption,\n objects: [], // Would be populated by actual object detection\n regions: [],\n tags: this.extractTagsFromCaption(caption),\n };\n }\n\n private generateCaptionFromFeatures(features: any): string {\n // Simplified caption generation\n // In reality, Florence-2 would use its language model to generate captions\n\n const scenes = [\n 'Indoor scene with various objects visible',\n 'Person in a room with furniture',\n 'Computer workspace with monitor and desk',\n 'Living space with natural lighting',\n 'Office environment with equipment',\n ];\n\n // Use feature values to select most appropriate caption\n const index = Math.abs(features[0][0]) * scenes.length;\n return scenes[Math.floor(index) % scenes.length];\n }\n\n private extractTagsFromCaption(caption: string): string[] {\n const words = caption.toLowerCase().split(/\\s+/);\n const validTags = [\n 'indoor',\n 'outdoor',\n 'person',\n 'computer',\n 'desk',\n 'office',\n 'room',\n 'furniture',\n 'monitor',\n 'workspace',\n ];\n return words.filter((word) => validTags.includes(word));\n }\n\n private async 
enhancedFallback(imageBuffer: Buffer): Promise<Florence2Result> {\n // Analyze image properties for better fallback\n const metadata = await sharp(imageBuffer).metadata();\n const stats = await sharp(imageBuffer).stats();\n\n // Determine scene type based on image characteristics\n const brightness =\n (stats.channels[0].mean + stats.channels[1].mean + stats.channels[2].mean) / 3;\n const isIndoor = brightness < 180; // Simplified heuristic\n\n // Generate contextual caption\n let caption = isIndoor ? 'Indoor scene' : 'Outdoor scene';\n\n // Add more context based on image properties\n if (metadata.width && metadata.height) {\n const aspectRatio = metadata.width / metadata.height;\n if (aspectRatio > 1.5) {\n caption += ' with wide field of view';\n } else if (aspectRatio < 0.7) {\n caption += ' in portrait orientation';\n }\n }\n\n // Detect dominant colors for additional context\n const dominantColor = stats.dominant;\n if (dominantColor.r > 200 && dominantColor.g > 200 && dominantColor.b > 200) {\n caption += ', well-lit environment';\n } else if (dominantColor.r < 100 && dominantColor.g < 100 && dominantColor.b < 100) {\n caption += ', dimly lit conditions';\n }\n\n return {\n caption,\n objects: [],\n regions: [],\n tags: this.extractTagsFromCaption(caption),\n };\n }\n\n isInitialized(): boolean {\n return this.initialized;\n }\n\n async dispose(): Promise<void> {\n if (this.model) {\n this.model.dispose();\n this.model = null;\n }\n this.initialized = false;\n logger.info('[Florence2Local] Model disposed');\n }\n}\n"]}
@@ -1,36 +0,0 @@
1
- import type { Florence2Result, ScreenTile, BoundingBox } from './types';
2
- export declare class Florence2Model { // Type declarations only; behaviour lives in the compiled implementation, which is not shown here.
3
- private initialized;
4
- private localModel; // NOTE(review): presumably an instance of the local model wrapper - confirm in the implementation.
5
- constructor();
6
- initialize(): Promise<void>;
7
- analyzeTile(tile: ScreenTile): Promise<Florence2Result>; // Declared to resolve with a Florence2Result for one ScreenTile.
8
- analyzeImage(imageBuffer: Buffer): Promise<Florence2Result>; // Same result type, taking an image Buffer instead of a tile.
9
- private mockAnalyze; // NOTE(review): name suggests a mock code path - confirm when it is used.
10
- detectUIElements(imageBuffer: Buffer): Promise<Array<{ // Declared to resolve with a list of detected elements, one object per element.
11
- type: string;
12
- bbox: BoundingBox;
13
- confidence: number;
14
- text?: string; // Optional: may be absent on an element.
15
- }>>;
16
- private mockAnalyzeBuffer;
17
- private mapToUIElementType;
18
- generateSceneGraph(tiles: ScreenTile[]): Promise<{ // Declared to resolve with a graph: `nodes` plus `edges` that reference nodes by string id.
19
- nodes: Array<{
20
- id: string;
21
- type: string;
22
- label: string;
23
- position: BoundingBox;
24
- }>;
25
- edges: Array<{
26
- source: string;
27
- target: string;
28
- relation: string;
29
- }>;
30
- }>;
31
- private inferSpatialRelation;
32
- private contains;
33
- private overlaps;
34
- isInitialized(): boolean;
35
- dispose(): Promise<void>;
36
- }